author    André Fabian Silva Delgado <emulatorman@parabola.nu>  2015-08-05 17:04:01 -0300
committer André Fabian Silva Delgado <emulatorman@parabola.nu>  2015-08-05 17:04:01 -0300
commit    57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree      5e910f0e82173f4ef4f51111366a3f1299037a7b /drivers/staging/lustre/lustre
Initial import
Diffstat (limited to 'drivers/staging/lustre/lustre')
-rw-r--r--drivers/staging/lustre/lustre/Kconfig62
-rw-r--r--drivers/staging/lustre/lustre/Makefile2
-rw-r--r--drivers/staging/lustre/lustre/fid/Makefile3
-rw-r--r--drivers/staging/lustre/lustre/fid/fid_internal.h56
-rw-r--r--drivers/staging/lustre/lustre/fid/fid_lib.c95
-rw-r--r--drivers/staging/lustre/lustre/fid/fid_request.c572
-rw-r--r--drivers/staging/lustre/lustre/fid/lproc_fid.c225
-rw-r--r--drivers/staging/lustre/lustre/fld/Makefile3
-rw-r--r--drivers/staging/lustre/lustre/fld/fld_cache.c546
-rw-r--r--drivers/staging/lustre/lustre/fld/fld_internal.h193
-rw-r--r--drivers/staging/lustre/lustre/fld/fld_request.c526
-rw-r--r--drivers/staging/lustre/lustre/fld/lproc_fld.c172
-rw-r--r--drivers/staging/lustre/lustre/include/cl_object.h3287
-rw-r--r--drivers/staging/lustre/lustre/include/dt_object.h1499
-rw-r--r--drivers/staging/lustre/lustre/include/interval_tree.h124
-rw-r--r--drivers/staging/lustre/lustre/include/lclient.h433
-rw-r--r--drivers/staging/lustre/lustre/include/linux/lustre_compat25.h216
-rw-r--r--drivers/staging/lustre/lustre/include/linux/lustre_lite.h98
-rw-r--r--drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h85
-rw-r--r--drivers/staging/lustre/lustre/include/linux/lustre_user.h70
-rw-r--r--drivers/staging/lustre/lustre/include/linux/obd.h125
-rw-r--r--drivers/staging/lustre/lustre/include/lprocfs_status.h1015
-rw-r--r--drivers/staging/lustre/lustre/include/lu_object.h1340
-rw-r--r--drivers/staging/lustre/lustre/include/lu_ref.h182
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/libiam.h145
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h121
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h2
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/lustre_errno.h215
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/lustre_idl.h3734
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h95
-rw-r--r--drivers/staging/lustre/lustre/include/lustre/lustre_user.h1179
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_acl.h49
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_capa.h305
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_cfg.h293
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_debug.h56
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_disk.h547
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_dlm.h1480
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_dlm_flags.h476
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_eacl.h95
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_export.h406
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_fid.h767
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_fld.h160
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_ha.h64
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_handles.h97
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_import.h385
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_intent.h62
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_lib.h666
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_lite.h150
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_log.h545
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_mdc.h191
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_mds.h81
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_net.h2967
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_param.h121
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_quota.h241
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_req_layout.h341
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_sec.h1147
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_ver.h26
-rw-r--r--drivers/staging/lustre/lustre/include/obd.h1496
-rw-r--r--drivers/staging/lustre/lustre/include/obd_cache.h39
-rw-r--r--drivers/staging/lustre/lustre/include/obd_cksum.h176
-rw-r--r--drivers/staging/lustre/lustre/include/obd_class.h1929
-rw-r--r--drivers/staging/lustre/lustre/include/obd_support.h862
-rw-r--r--drivers/staging/lustre/lustre/lclient/glimpse.c269
-rw-r--r--drivers/staging/lustre/lustre/lclient/lcommon_cl.c1287
-rw-r--r--drivers/staging/lustre/lustre/lclient/lcommon_misc.c199
-rw-r--r--drivers/staging/lustre/lustre/ldlm/interval_tree.c751
-rw-r--r--drivers/staging/lustre/lustre/ldlm/l_lock.c76
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_extent.c241
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_flock.c859
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c74
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_internal.h316
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_lib.c870
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_lock.c2322
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c1191
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_plain.c72
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_pool.c1455
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_request.c2294
-rw-r--r--drivers/staging/lustre/lustre/ldlm/ldlm_resource.c1425
-rw-r--r--drivers/staging/lustre/lustre/libcfs/Makefile18
-rw-r--r--drivers/staging/lustre/lustre/libcfs/debug.c460
-rw-r--r--drivers/staging/lustre/lustre/libcfs/fail.c138
-rw-r--r--drivers/staging/lustre/lustre/libcfs/hash.c2098
-rw-r--r--drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c240
-rw-r--r--drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c224
-rw-r--r--drivers/staging/lustre/lustre/libcfs/libcfs_lock.c189
-rw-r--r--drivers/staging/lustre/lustre/libcfs/libcfs_mem.c202
-rw-r--r--drivers/staging/lustre/lustre/libcfs/libcfs_string.c562
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c1056
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c141
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c291
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h29
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c111
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c200
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-module.c183
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c217
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c623
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c275
-rw-r--r--drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h48
-rw-r--r--drivers/staging/lustre/lustre/libcfs/module.c976
-rw-r--r--drivers/staging/lustre/lustre/libcfs/nidstrings.c842
-rw-r--r--drivers/staging/lustre/lustre/libcfs/prng.c139
-rw-r--r--drivers/staging/lustre/lustre/libcfs/tracefile.c1196
-rw-r--r--drivers/staging/lustre/lustre/libcfs/tracefile.h340
-rw-r--r--drivers/staging/lustre/lustre/libcfs/workitem.c479
-rw-r--r--drivers/staging/lustre/lustre/llite/Makefile11
-rw-r--r--drivers/staging/lustre/lustre/llite/dcache.c363
-rw-r--r--drivers/staging/lustre/lustre/llite/dir.c1971
-rw-r--r--drivers/staging/lustre/lustre/llite/file.c3624
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_capa.c654
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_close.c393
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_internal.h1521
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_lib.c2354
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_mmap.c492
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_nfs.c335
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_rmtacl.c300
-rw-r--r--drivers/staging/lustre/lustre/llite/lloop.c877
-rw-r--r--drivers/staging/lustre/lustre/llite/lproc_llite.c1536
-rw-r--r--drivers/staging/lustre/lustre/llite/namei.c1178
-rw-r--r--drivers/staging/lustre/lustre/llite/remote_perm.c331
-rw-r--r--drivers/staging/lustre/lustre/llite/rw.c1289
-rw-r--r--drivers/staging/lustre/lustre/llite/rw26.c553
-rw-r--r--drivers/staging/lustre/lustre/llite/statahead.c1729
-rw-r--r--drivers/staging/lustre/lustre/llite/super25.c226
-rw-r--r--drivers/staging/lustre/lustre/llite/symlink.c170
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_dev.c547
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_internal.h62
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_io.c1209
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_lock.c85
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_object.c201
-rw-r--r--drivers/staging/lustre/lustre/llite/vvp_page.c551
-rw-r--r--drivers/staging/lustre/lustre/llite/xattr.c621
-rw-r--r--drivers/staging/lustre/lustre/llite/xattr_cache.c538
-rw-r--r--drivers/staging/lustre/lustre/lmv/Makefile3
-rw-r--r--drivers/staging/lustre/lustre/lmv/lmv_fld.c83
-rw-r--r--drivers/staging/lustre/lustre/lmv/lmv_intent.c323
-rw-r--r--drivers/staging/lustre/lustre/lmv/lmv_internal.h157
-rw-r--r--drivers/staging/lustre/lustre/lmv/lmv_obd.c2892
-rw-r--r--drivers/staging/lustre/lustre/lmv/lproc_lmv.c237
-rw-r--r--drivers/staging/lustre/lustre/lov/Makefile6
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_cl_internal.h839
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_dev.c528
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_ea.c363
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_internal.h319
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_io.c990
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_lock.c1198
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_merge.c186
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_obd.c2395
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_object.c1001
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_offset.c264
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_pack.c511
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_page.c232
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_pool.c673
-rw-r--r--drivers/staging/lustre/lustre/lov/lov_request.c773
-rw-r--r--drivers/staging/lustre/lustre/lov/lovsub_dev.c209
-rw-r--r--drivers/staging/lustre/lustre/lov/lovsub_io.c55
-rw-r--r--drivers/staging/lustre/lustre/lov/lovsub_lock.c466
-rw-r--r--drivers/staging/lustre/lustre/lov/lovsub_object.c164
-rw-r--r--drivers/staging/lustre/lustre/lov/lovsub_page.c71
-rw-r--r--drivers/staging/lustre/lustre/lov/lproc_lov.c311
-rw-r--r--drivers/staging/lustre/lustre/mdc/Makefile3
-rw-r--r--drivers/staging/lustre/lustre/mdc/lproc_mdc.c220
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_internal.h181
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_lib.c593
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_locks.c1313
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_reint.c483
-rw-r--r--drivers/staging/lustre/lustre/mdc/mdc_request.c2731
-rw-r--r--drivers/staging/lustre/lustre/mgc/Makefile3
-rw-r--r--drivers/staging/lustre/lustre/mgc/lproc_mgc.c80
-rw-r--r--drivers/staging/lustre/lustre/mgc/mgc_internal.h73
-rw-r--r--drivers/staging/lustre/lustre/mgc/mgc_request.c1762
-rw-r--r--drivers/staging/lustre/lustre/obdclass/Makefile11
-rw-r--r--drivers/staging/lustre/lustre/obdclass/acl.c548
-rw-r--r--drivers/staging/lustre/lustre/obdclass/capa.c421
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_internal.h121
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_io.c1669
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_lock.c2239
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_object.c1139
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_page.c1553
-rw-r--r--drivers/staging/lustre/lustre/obdclass/class_obd.c704
-rw-r--r--drivers/staging/lustre/lustre/obdclass/debug.c109
-rw-r--r--drivers/staging/lustre/lustre/obdclass/dt_object.c1059
-rw-r--r--drivers/staging/lustre/lustre/obdclass/genops.c1833
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linux/linux-module.c449
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c222
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c405
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog.c1007
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_cat.c815
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_internal.h98
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_obd.c262
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_swab.c415
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c139
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lprocfs_status.c2059
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lu_object.c2192
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lu_ref.c50
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lustre_handles.c257
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lustre_peer.c217
-rw-r--r--drivers/staging/lustre/lustre/obdclass/obd_config.c1953
-rw-r--r--drivers/staging/lustre/lustre/obdclass/obd_mount.c1319
-rw-r--r--drivers/staging/lustre/lustre/obdclass/obdo.c362
-rw-r--r--drivers/staging/lustre/lustre/obdclass/statfs_pack.c75
-rw-r--r--drivers/staging/lustre/lustre/obdclass/uuid.c82
-rw-r--r--drivers/staging/lustre/lustre/obdecho/Makefile2
-rw-r--r--drivers/staging/lustre/lustre/obdecho/echo_client.c2197
-rw-r--r--drivers/staging/lustre/lustre/obdecho/echo_internal.h47
-rw-r--r--drivers/staging/lustre/lustre/obdecho/lproc_echo.c57
-rw-r--r--drivers/staging/lustre/lustre/osc/Makefile4
-rw-r--r--drivers/staging/lustre/lustre/osc/lproc_osc.c751
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_cache.c2944
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_cl_internal.h685
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_dev.c262
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_internal.h203
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_io.c819
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_lock.c1613
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_object.c271
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_page.c916
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_quota.c327
-rw-r--r--drivers/staging/lustre/lustre/osc/osc_request.c3379
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/Makefile20
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/client.c3149
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/connection.c241
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/errno.c380
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/events.c585
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/import.c1642
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/layout.c2442
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/llog_client.c366
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/llog_net.c72
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c1366
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/niobuf.c731
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/nrs.c1754
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c270
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/pack_generic.c2536
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/pers.c75
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/pinger.c678
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h312
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c171
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c811
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/recover.c379
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec.c2459
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c884
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec_config.c901
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec_gc.c252
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c199
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec_null.c458
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/sec_plain.c1013
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/service.c3105
-rw-r--r--drivers/staging/lustre/lustre/ptlrpc/wiretest.c4492
246 files changed, 174034 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
new file mode 100644
index 000000000..62c7bba75
--- /dev/null
+++ b/drivers/staging/lustre/lustre/Kconfig
@@ -0,0 +1,62 @@
+config LUSTRE_FS
+ tristate "Lustre file system client support"
+ depends on INET && m && !MIPS && !XTENSA && !SUPERH
+ select LNET
+ select CRYPTO
+ select CRYPTO_CRC32
+ select CRYPTO_CRC32_PCLMUL if X86
+ select CRYPTO_CRC32C
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ depends on MULTIUSER
+ help
+ This option enables Lustre file system client support. Choose Y
+ here if you want to access a Lustre file system cluster. To compile
+ this file system support as a module, choose M here: the module will
+ be called lustre.
+
+ To mount Lustre file systems, you also need to install the user space
+ mount.lustre and other user space commands which can be found in the
+ lustre-client package, available from
+ http://downloads.whamcloud.com/public/lustre/
+
+ Lustre is a cluster file system widely used in high performance
+ computing. Source code of both the kernel space and user space
+ Lustre components can also be found at
+ http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
+
+ If unsure, say N.
+
+ See also http://wiki.lustre.org/
+
+config LUSTRE_OBD_MAX_IOCTL_BUFFER
+ int "Lustre obd max ioctl buffer bytes (default 8KB)"
+ depends on LUSTRE_FS
+ default 8192
+ help
+ This option defines the maximum buffer size, in bytes, that user space
+ applications can pass to the Lustre kernel module through the ioctl interface.
+
+ If unsure, use the default.
+
+config LUSTRE_DEBUG_EXPENSIVE_CHECK
+ bool "Enable Lustre DEBUG checks"
+ depends on LUSTRE_FS
+ help
+ This option is mainly for debugging purposes. It enables Lustre code to do
+ expensive checks that may have a performance impact.
+
+ Use with caution. If unsure, say N.
+
+config LUSTRE_TRANSLATE_ERRNOS
+ bool
+ depends on LUSTRE_FS && !X86
+ default y
+
+config LUSTRE_LLITE_LLOOP
+ tristate "Lustre virtual block device"
+ depends on LUSTRE_FS && BLOCK
+ depends on !PPC_64K_PAGES && !ARM64_64K_PAGES && !MICROBLAZE_64K_PAGES && !PAGE_SIZE_64KB && !IA64_PAGE_SIZE_64KB && !PARISC_PAGE_SIZE_64KB
+ default m
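
A note on the LUSTRE_OBD_MAX_IOCTL_BUFFER option above: it only sets an integer cap on the ioctl payload size. The sketch below shows, in plain C, how such a cap is typically enforced before a user buffer is copied in; the helper name lustre_ioctl_len_ok() is made up for illustration and is not part of the driver.

/* Hypothetical sketch (not the driver's code): validate an ioctl buffer
 * length against the configured CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER cap
 * before doing any copy_from_user(). */
static int lustre_ioctl_len_ok(unsigned long len)
{
	return len > 0 && len <= CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER;
}
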
diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile
new file mode 100644
index 000000000..35d8b0b2d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_LUSTRE_FS) += libcfs/ obdclass/ ptlrpc/ fld/ osc/ mgc/ \
+ fid/ lov/ mdc/ lmv/ llite/ obdecho/
diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile
new file mode 100644
index 000000000..5513ce416
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LUSTRE_FS) += fid.o
+fid-y := fid_request.o fid_lib.o
+fid-$(CONFIG_PROC_FS) += lproc_fid.o
diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h
new file mode 100644
index 000000000..b5e8da895
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_internal.h
@@ -0,0 +1,56 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+#ifndef __FID_INTERNAL_H
+#define __FID_INTERNAL_H
+
+#include "../include/lustre/lustre_idl.h"
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* Functions used internally in module. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+ const struct lu_env *env);
+
+#if defined(CONFIG_PROC_FS)
+extern struct lprocfs_vars seq_client_proc_list[];
+#endif
+
+extern struct proc_dir_entry *seq_type_proc_dir;
+
+#endif /* __FID_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c
new file mode 100644
index 000000000..dd65159eb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_lib.c
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_lib.c
+ *
+ * Miscellaneous fid functions.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/module.h>
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_fid.h"
+
+/**
+ * A cluster-wide range from which fid-sequences are granted to servers and
+ * then clients.
+ *
+ * Fid namespace:
+ * <pre>
+ * Normal FID: seq:64 [2^33,2^64-1] oid:32 ver:32
+ * IGIF : 0:32, ino:32 gen:32 0:32
+ * IDIF : 0:31, 1:1, ost-index:16, objd:48 0:32
+ * </pre>
+ *
+ * The first 0x400 sequences of normal FID are reserved for special purposes.
+ * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for the .lustre directory and its objects.
+ */
+const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
+ FID_SEQ_NORMAL,
+ (__u64)~0ULL
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE);
+
+/* Zero range, used for init and other purposes. */
+const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = {
+ 0,
+ 0
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE);
+
+/* Lustre Big Fs Lock fid. */
+const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL,
+ .f_oid = FID_OID_SPECIAL_BFL,
+ .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+ .f_oid = FID_OID_DOT_LUSTRE,
+ .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+ .f_oid = FID_OID_DOT_LUSTRE_OBF,
+ .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);
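
The comment at the top of fid_lib.c describes the on-wire FID layout: a 64-bit sequence, a 32-bit object id and a 32-bit version, with normal sequences starting at 2^33 and the first 0x400 of them reserved. The standalone C sketch below just spells that layout out; the struct is a local mirror of the f_seq/f_oid/f_ver fields used above, and the boundary values are assumptions derived from that comment rather than constants quoted from the Lustre headers.

/* Standalone illustration of the FID layout described in fid_lib.c.
 * The struct mirrors the f_seq/f_oid/f_ver fields; the sequence
 * boundaries are assumptions based on the comment above (normal
 * sequences start at 2^33, the first 0x400 of them are reserved). */
#include <stdint.h>
#include <stdio.h>

struct demo_fid {
	uint64_t f_seq;	/* sequence granted by the server */
	uint32_t f_oid;	/* object id within that sequence */
	uint32_t f_ver;	/* version, normally 0 */
};

#define DEMO_FID_SEQ_START	0x200000000ULL		/* 2^33 */
#define DEMO_FID_SEQ_NORMAL	(DEMO_FID_SEQ_START + 0x400)

int main(void)
{
	struct demo_fid fid = {
		.f_seq = DEMO_FID_SEQ_NORMAL,
		.f_oid = 1,
		.f_ver = 0,
	};

	printf("[%#jx:0x%x:0x%x] is %s FID\n",
	       (uintmax_t)fid.f_seq, fid.f_oid, fid.f_ver,
	       fid.f_seq >= DEMO_FID_SEQ_NORMAL ? "a normal" : "a reserved");
	return 0;
}
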
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
new file mode 100644
index 000000000..063441abf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_request.c
@@ -0,0 +1,572 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_request.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/module.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+/* mdc RPC locks */
+#include "../include/lustre_mdc.h"
+#include "fid_internal.h"
+
+static int seq_client_rpc(struct lu_client_seq *seq,
+ struct lu_seq_range *output, __u32 opc,
+ const char *opcname)
+{
+ struct obd_export *exp = seq->lcs_exp;
+ struct ptlrpc_request *req;
+ struct lu_seq_range *out, *in;
+ __u32 *op;
+ unsigned int debug_mask;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY,
+ LUSTRE_MDS_VERSION, SEQ_QUERY);
+ if (req == NULL)
+ return -ENOMEM;
+
+ /* Init operation code */
+ op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC);
+ *op = opc;
+
+ /* Zero out input range, this is not recovery yet. */
+ in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE);
+ range_init(in);
+
+ ptlrpc_request_set_replen(req);
+
+ in->lsr_index = seq->lcs_space.lsr_index;
+ if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+ fld_range_set_mdt(in);
+ else
+ fld_range_set_ost(in);
+
+ if (opc == SEQ_ALLOC_SUPER) {
+ req->rq_request_portal = SEQ_CONTROLLER_PORTAL;
+ req->rq_reply_portal = MDC_REPLY_PORTAL;
+ /* While allocating a super sequence for a data object, the
+ * current thread might hold the export of MDT0 (MDT0 is
+ * precreating objects on this OST) and sends the request to
+ * MDT0 here, so we cannot keep resending the request;
+ * otherwise, if MDT0 fails (is unmounted), it cannot
+ * release the export of MDT0. */
+ if (seq->lcs_type == LUSTRE_SEQ_DATA)
+ req->rq_no_delay = req->rq_no_resend = 1;
+ debug_mask = D_CONSOLE;
+ } else {
+ if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+ req->rq_request_portal = SEQ_METADATA_PORTAL;
+ else
+ req->rq_request_portal = SEQ_DATA_PORTAL;
+ debug_mask = D_INFO;
+ }
+
+ ptlrpc_at_set_req_timeout(req);
+
+ if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+ mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+ rc = ptlrpc_queue_wait(req);
+ if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+ mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+ if (rc)
+ goto out_req;
+
+ out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
+ *output = *out;
+
+ if (!range_is_sane(output)) {
+ CERROR("%s: Invalid range received from server: "
+ DRANGE"\n", seq->lcs_name, PRANGE(output));
+ rc = -EINVAL;
+ goto out_req;
+ }
+
+ if (range_is_exhausted(output)) {
+ CERROR("%s: Range received from server is exhausted: "
+ DRANGE"]\n", seq->lcs_name, PRANGE(output));
+ rc = -EINVAL;
+ goto out_req;
+ }
+
+ CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n",
+ seq->lcs_name, opcname, PRANGE(output));
+
+out_req:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+/* Request sequence-controller node to allocate new super-sequence. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+ const struct lu_env *env)
+{
+ int rc;
+
+ mutex_lock(&seq->lcs_mutex);
+
+ if (seq->lcs_srv) {
+ rc = 0;
+ } else {
+ /* Check whether the connection to the seq controller has
+ * been set up (lcs_exp != NULL). */
+ if (seq->lcs_exp == NULL) {
+ mutex_unlock(&seq->lcs_mutex);
+ return -EINPROGRESS;
+ }
+
+ rc = seq_client_rpc(seq, &seq->lcs_space,
+ SEQ_ALLOC_SUPER, "super");
+ }
+ mutex_unlock(&seq->lcs_mutex);
+ return rc;
+}
+
+/* Request sequence-controller node to allocate new meta-sequence. */
+static int seq_client_alloc_meta(const struct lu_env *env,
+ struct lu_client_seq *seq)
+{
+ int rc;
+
+ if (seq->lcs_srv) {
+ rc = 0;
+ } else {
+ do {
+ /* If the meta server returns -EINPROGRESS or -EAGAIN,
+ * it means the meta server might not be ready to
+ * allocate a super sequence from the sequence
+ * controller (MDT0) yet. */
+ rc = seq_client_rpc(seq, &seq->lcs_space,
+ SEQ_ALLOC_META, "meta");
+ } while (rc == -EINPROGRESS || rc == -EAGAIN);
+ }
+
+ return rc;
+}
+
+/* Allocate new sequence for client. */
+static int seq_client_alloc_seq(const struct lu_env *env,
+ struct lu_client_seq *seq, u64 *seqnr)
+{
+ int rc;
+
+ LASSERT(range_is_sane(&seq->lcs_space));
+
+ if (range_is_exhausted(&seq->lcs_space)) {
+ rc = seq_client_alloc_meta(env, seq);
+ if (rc) {
+ CERROR("%s: Can't allocate new meta-sequence, rc %d\n",
+ seq->lcs_name, rc);
+ return rc;
+ } else {
+ CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+ seq->lcs_name, PRANGE(&seq->lcs_space));
+ }
+ } else {
+ rc = 0;
+ }
+
+ LASSERT(!range_is_exhausted(&seq->lcs_space));
+ *seqnr = seq->lcs_space.lsr_start;
+ seq->lcs_space.lsr_start += 1;
+
+ CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name,
+ *seqnr);
+
+ return rc;
+}
+
+static int seq_fid_alloc_prep(struct lu_client_seq *seq,
+ wait_queue_t *link)
+{
+ if (seq->lcs_update) {
+ add_wait_queue(&seq->lcs_waitq, link);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&seq->lcs_mutex);
+
+ schedule();
+
+ mutex_lock(&seq->lcs_mutex);
+ remove_wait_queue(&seq->lcs_waitq, link);
+ set_current_state(TASK_RUNNING);
+ return -EAGAIN;
+ }
+ ++seq->lcs_update;
+ mutex_unlock(&seq->lcs_mutex);
+ return 0;
+}
+
+static void seq_fid_alloc_fini(struct lu_client_seq *seq)
+{
+ LASSERT(seq->lcs_update == 1);
+ mutex_lock(&seq->lcs_mutex);
+ --seq->lcs_update;
+ wake_up(&seq->lcs_waitq);
+}
+
+/**
+ * Allocate the whole seq to the caller.
+ **/
+int seq_client_get_seq(const struct lu_env *env,
+ struct lu_client_seq *seq, u64 *seqnr)
+{
+ wait_queue_t link;
+ int rc;
+
+ LASSERT(seqnr != NULL);
+ mutex_lock(&seq->lcs_mutex);
+ init_waitqueue_entry(&link, current);
+
+ while (1) {
+ rc = seq_fid_alloc_prep(seq, &link);
+ if (rc == 0)
+ break;
+ }
+
+ rc = seq_client_alloc_seq(env, seq, seqnr);
+ if (rc) {
+ CERROR("%s: Can't allocate new sequence, rc %d\n",
+ seq->lcs_name, rc);
+ seq_fid_alloc_fini(seq);
+ mutex_unlock(&seq->lcs_mutex);
+ return rc;
+ }
+
+ CDEBUG(D_INFO, "%s: allocate sequence [0x%16.16Lx]\n",
+ seq->lcs_name, *seqnr);
+
+ /* Since the caller requires the whole sequence,
+ * mark this sequence as fully used. */
+ if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+ seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+ else
+ seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+ seq->lcs_fid.f_seq = *seqnr;
+ seq->lcs_fid.f_ver = 0;
+ /*
+ * Inform caller that sequence switch is performed to allow it
+ * to setup FLD for it.
+ */
+ seq_fid_alloc_fini(seq);
+ mutex_unlock(&seq->lcs_mutex);
+
+ return rc;
+}
+EXPORT_SYMBOL(seq_client_get_seq);
+
+/* Allocate new fid on passed client @seq and save it to @fid. */
+int seq_client_alloc_fid(const struct lu_env *env,
+ struct lu_client_seq *seq, struct lu_fid *fid)
+{
+ wait_queue_t link;
+ int rc;
+
+ LASSERT(seq != NULL);
+ LASSERT(fid != NULL);
+
+ init_waitqueue_entry(&link, current);
+ mutex_lock(&seq->lcs_mutex);
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST))
+ seq->lcs_fid.f_oid = seq->lcs_width;
+
+ while (1) {
+ u64 seqnr;
+
+ if (!fid_is_zero(&seq->lcs_fid) &&
+ fid_oid(&seq->lcs_fid) < seq->lcs_width) {
+ /* Just bump last allocated fid and return to caller. */
+ seq->lcs_fid.f_oid += 1;
+ rc = 0;
+ break;
+ }
+
+ rc = seq_fid_alloc_prep(seq, &link);
+ if (rc)
+ continue;
+
+ rc = seq_client_alloc_seq(env, seq, &seqnr);
+ if (rc) {
+ CERROR("%s: Can't allocate new sequence, rc %d\n",
+ seq->lcs_name, rc);
+ seq_fid_alloc_fini(seq);
+ mutex_unlock(&seq->lcs_mutex);
+ return rc;
+ }
+
+ CDEBUG(D_INFO, "%s: Switch to sequence [0x%16.16Lx]\n",
+ seq->lcs_name, seqnr);
+
+ seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+ seq->lcs_fid.f_seq = seqnr;
+ seq->lcs_fid.f_ver = 0;
+
+ /*
+ * Inform caller that sequence switch is performed to allow it
+ * to setup FLD for it.
+ */
+ rc = 1;
+
+ seq_fid_alloc_fini(seq);
+ break;
+ }
+
+ *fid = seq->lcs_fid;
+ mutex_unlock(&seq->lcs_mutex);
+
+ CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid));
+ return rc;
+}
+EXPORT_SYMBOL(seq_client_alloc_fid);
+
+/*
+ * Finish the current sequence due to disconnect.
+ * See mdc_import_event()
+ */
+void seq_client_flush(struct lu_client_seq *seq)
+{
+ wait_queue_t link;
+
+ LASSERT(seq != NULL);
+ init_waitqueue_entry(&link, current);
+ mutex_lock(&seq->lcs_mutex);
+
+ while (seq->lcs_update) {
+ add_wait_queue(&seq->lcs_waitq, &link);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&seq->lcs_mutex);
+
+ schedule();
+
+ mutex_lock(&seq->lcs_mutex);
+ remove_wait_queue(&seq->lcs_waitq, &link);
+ set_current_state(TASK_RUNNING);
+ }
+
+ fid_zero(&seq->lcs_fid);
+ /*
+ * This index should not be used for seq range allocation;
+ * set it to -1 for debug checks.
+ */
+
+ seq->lcs_space.lsr_index = -1;
+
+ range_init(&seq->lcs_space);
+ mutex_unlock(&seq->lcs_mutex);
+}
+EXPORT_SYMBOL(seq_client_flush);
+
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+#if defined(CONFIG_PROC_FS)
+ if (seq->lcs_proc_dir) {
+ if (!IS_ERR(seq->lcs_proc_dir))
+ lprocfs_remove(&seq->lcs_proc_dir);
+ seq->lcs_proc_dir = NULL;
+ }
+#endif /* CONFIG_PROC_FS */
+}
+
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+#if defined(CONFIG_PROC_FS)
+ int rc;
+
+ seq->lcs_proc_dir = lprocfs_register(seq->lcs_name,
+ seq_type_proc_dir,
+ NULL, NULL);
+
+ if (IS_ERR(seq->lcs_proc_dir)) {
+ CERROR("%s: LProcFS failed in seq-init\n",
+ seq->lcs_name);
+ rc = PTR_ERR(seq->lcs_proc_dir);
+ return rc;
+ }
+
+ rc = lprocfs_add_vars(seq->lcs_proc_dir,
+ seq_client_proc_list, seq);
+ if (rc) {
+ CERROR("%s: Can't init sequence manager proc, rc %d\n",
+ seq->lcs_name, rc);
+ goto out_cleanup;
+ }
+
+ return 0;
+
+out_cleanup:
+ seq_client_proc_fini(seq);
+ return rc;
+
+#else /* CONFIG_PROC_FS */
+ return 0;
+#endif
+}
+
+int seq_client_init(struct lu_client_seq *seq,
+ struct obd_export *exp,
+ enum lu_cli_type type,
+ const char *prefix,
+ struct lu_server_seq *srv)
+{
+ int rc;
+
+ LASSERT(seq != NULL);
+ LASSERT(prefix != NULL);
+
+ seq->lcs_srv = srv;
+ seq->lcs_type = type;
+
+ mutex_init(&seq->lcs_mutex);
+ if (type == LUSTRE_SEQ_METADATA)
+ seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+ else
+ seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+ init_waitqueue_head(&seq->lcs_waitq);
+ /* Make sure that things are clear before work is started. */
+ seq_client_flush(seq);
+
+ if (exp != NULL)
+ seq->lcs_exp = class_export_get(exp);
+ else if (type == LUSTRE_SEQ_METADATA)
+ LASSERT(seq->lcs_srv != NULL);
+
+ snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+ "cli-%s", prefix);
+
+ rc = seq_client_proc_init(seq);
+ if (rc)
+ seq_client_fini(seq);
+ return rc;
+}
+EXPORT_SYMBOL(seq_client_init);
+
+void seq_client_fini(struct lu_client_seq *seq)
+{
+ seq_client_proc_fini(seq);
+
+ if (seq->lcs_exp != NULL) {
+ class_export_put(seq->lcs_exp);
+ seq->lcs_exp = NULL;
+ }
+
+ seq->lcs_srv = NULL;
+}
+EXPORT_SYMBOL(seq_client_fini);
+
+int client_fid_init(struct obd_device *obd,
+ struct obd_export *exp, enum lu_cli_type type)
+{
+ struct client_obd *cli = &obd->u.cli;
+ char *prefix;
+ int rc;
+
+ OBD_ALLOC_PTR(cli->cl_seq);
+ if (cli->cl_seq == NULL)
+ return -ENOMEM;
+
+ OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+ if (prefix == NULL) {
+ rc = -ENOMEM;
+ goto out_free_seq;
+ }
+
+ snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name);
+
+ /* Init client side sequence-manager */
+ rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL);
+ OBD_FREE(prefix, MAX_OBD_NAME + 5);
+ if (rc)
+ goto out_free_seq;
+
+ return rc;
+out_free_seq:
+ OBD_FREE_PTR(cli->cl_seq);
+ cli->cl_seq = NULL;
+ return rc;
+}
+EXPORT_SYMBOL(client_fid_init);
+
+int client_fid_fini(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+
+ if (cli->cl_seq != NULL) {
+ seq_client_fini(cli->cl_seq);
+ OBD_FREE_PTR(cli->cl_seq);
+ cli->cl_seq = NULL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(client_fid_fini);
+
+struct proc_dir_entry *seq_type_proc_dir;
+
+static int __init fid_mod_init(void)
+{
+ seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME,
+ proc_lustre_root,
+ NULL, NULL);
+ return PTR_ERR_OR_ZERO(seq_type_proc_dir);
+}
+
+static void __exit fid_mod_exit(void)
+{
+ if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) {
+ lprocfs_remove(&seq_type_proc_dir);
+ seq_type_proc_dir = NULL;
+ }
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FID Module");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.1.0");
+
+module_init(fid_mod_init);
+module_exit(fid_mod_exit);
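
One convention in fid_request.c that is easy to miss: seq_client_alloc_fid() returns 1 (not 0) when it had to switch to a new sequence, precisely so the caller can set up the FLD mapping for it, while negative values are errors. The fragment below is only a hedged illustration of that calling convention; the real callers live in the mdc/lmv code and are not part of this file.

/* Hedged illustration of the seq_client_alloc_fid() return convention,
 * not actual mdc/lmv code: rc < 0 is an error, rc == 1 means a new
 * sequence was started and the caller should set up the FLD for it,
 * rc == 0 means the FID came from the current sequence. */
static int demo_alloc_fid(const struct lu_env *env,
			  struct lu_client_seq *seq, struct lu_fid *fid)
{
	int rc = seq_client_alloc_fid(env, seq, fid);

	if (rc < 0)
		return rc;	/* allocation failed */
	if (rc == 1) {
		/* new sequence: (re)populate the FLD mapping here */
	}
	return 0;		/* *fid now holds a freshly allocated FID */
}
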
diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c
new file mode 100644
index 000000000..783939dbd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/lproc_fid.c
@@ -0,0 +1,225 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/lproc_fid.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/module.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/dt_object.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_req_layout.h"
+#include "../include/lustre_fid.h"
+#include "fid_internal.h"
+
+/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */
+#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64))
+/*
+ * Note: this function is only used for testing; it is not safe for
+ * production use.
+ */
+static int lprocfs_fid_write_common(const char __user *buffer, size_t count,
+ struct lu_seq_range *range)
+{
+ struct lu_seq_range tmp;
+ int rc;
+ char kernbuf[MAX_FID_RANGE_STRLEN];
+
+ LASSERT(range != NULL);
+
+ if (count >= sizeof(kernbuf))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = 0;
+
+ if (count == 5 && strcmp(kernbuf, "clear") == 0) {
+ memset(range, 0, sizeof(*range));
+ return count;
+ }
+
+ /* of the form "[0x0000000240000400 - 0x000000028000400]" */
+ rc = sscanf(kernbuf, "[%llx - %llx]\n",
+ (unsigned long long *)&tmp.lsr_start,
+ (unsigned long long *)&tmp.lsr_end);
+ if (!range_is_sane(&tmp) || range_is_zero(&tmp) ||
+ tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end)
+ return -EINVAL;
+ *range = tmp;
+ return count;
+}
+
+/* Client side procfs stuff */
+static ssize_t lprocfs_fid_space_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct lu_client_seq *seq;
+ int rc;
+
+ seq = ((struct seq_file *)file->private_data)->private;
+ LASSERT(seq != NULL);
+
+ mutex_lock(&seq->lcs_mutex);
+ rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space);
+
+ if (rc == 0) {
+ CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+ seq->lcs_name, PRANGE(&seq->lcs_space));
+ }
+
+ mutex_unlock(&seq->lcs_mutex);
+
+ return count;
+}
+
+static int
+lprocfs_fid_space_seq_show(struct seq_file *m, void *unused)
+{
+ struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+
+ LASSERT(seq != NULL);
+
+ mutex_lock(&seq->lcs_mutex);
+ seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space));
+ mutex_unlock(&seq->lcs_mutex);
+
+ return 0;
+}
+
+static ssize_t lprocfs_fid_width_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct lu_client_seq *seq;
+ __u64 max;
+ int rc, val;
+
+ seq = ((struct seq_file *)file->private_data)->private;
+ LASSERT(seq != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ mutex_lock(&seq->lcs_mutex);
+ if (seq->lcs_type == LUSTRE_SEQ_DATA)
+ max = LUSTRE_DATA_SEQ_MAX_WIDTH;
+ else
+ max = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+
+ if (val <= max && val > 0) {
+ seq->lcs_width = val;
+
+ if (rc == 0) {
+ CDEBUG(D_INFO, "%s: Sequence size: %llu\n",
+ seq->lcs_name, seq->lcs_width);
+ }
+ }
+
+ mutex_unlock(&seq->lcs_mutex);
+
+ return count;
+}
+
+static int
+lprocfs_fid_width_seq_show(struct seq_file *m, void *unused)
+{
+ struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+
+ LASSERT(seq != NULL);
+
+ mutex_lock(&seq->lcs_mutex);
+ seq_printf(m, "%llu\n", seq->lcs_width);
+ mutex_unlock(&seq->lcs_mutex);
+
+ return 0;
+}
+
+static int
+lprocfs_fid_fid_seq_show(struct seq_file *m, void *unused)
+{
+ struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+
+ LASSERT(seq != NULL);
+
+ mutex_lock(&seq->lcs_mutex);
+ seq_printf(m, DFID "\n", PFID(&seq->lcs_fid));
+ mutex_unlock(&seq->lcs_mutex);
+
+ return 0;
+}
+
+static int
+lprocfs_fid_server_seq_show(struct seq_file *m, void *unused)
+{
+ struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+ struct client_obd *cli;
+
+ LASSERT(seq != NULL);
+
+ if (seq->lcs_exp != NULL) {
+ cli = &seq->lcs_exp->exp_obd->u.cli;
+ seq_printf(m, "%s\n", cli->cl_target_uuid.uuid);
+ } else {
+ seq_printf(m, "%s\n", seq->lcs_srv->lss_name);
+ }
+
+ return 0;
+}
+
+LPROC_SEQ_FOPS(lprocfs_fid_space);
+LPROC_SEQ_FOPS(lprocfs_fid_width);
+LPROC_SEQ_FOPS_RO(lprocfs_fid_server);
+LPROC_SEQ_FOPS_RO(lprocfs_fid_fid);
+
+struct lprocfs_vars seq_client_proc_list[] = {
+ { "space", &lprocfs_fid_space_fops },
+ { "width", &lprocfs_fid_width_fops },
+ { "server", &lprocfs_fid_server_fops },
+ { "fid", &lprocfs_fid_fid_fops },
+ { NULL }
+};
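
lprocfs_fid_write_common() above accepts exactly two kinds of input on the 'space' proc file: the literal string "clear", or a range written as "[0xSTART - 0xEND]". The small userspace program below mirrors that parse so the accepted format is easy to see; it is an illustration only, and the proc file path itself is not shown because it depends on the sequence name.

/* Userspace sketch mirroring the parse in lprocfs_fid_write_common():
 * accepts "clear" or "[0x<start> - 0x<end>]". Illustration only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *input = "[0x0000000240000400 - 0x0000000280000400]";
	unsigned long long start, end;

	if (strcmp(input, "clear") == 0)
		printf("range cleared\n");
	else if (sscanf(input, "[%llx - %llx]", &start, &end) == 2 &&
		 start < end)
		printf("new range [%#llx - %#llx]\n", start, end);
	else
		printf("invalid input\n");
	return 0;
}
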
diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile
new file mode 100644
index 000000000..2bbf08433
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LUSTRE_FS) += fld.o
+fld-y := fld_request.o fld_cache.o
+fld-$(CONFIG_PROC_FS) += lproc_fld.o
diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c
new file mode 100644
index 000000000..0d0a73745
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_cache.c
@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_cache.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/module.h>
+#include <asm/div64.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_ver.h"
+#include "../include/obd_support.h"
+#include "../include/lprocfs_status.h"
+
+#include "../include/dt_object.h"
+#include "../include/lustre_req_layout.h"
+#include "../include/lustre_fld.h"
+#include "fld_internal.h"
+
+/**
+ * create fld cache.
+ */
+struct fld_cache *fld_cache_init(const char *name,
+ int cache_size, int cache_threshold)
+{
+ struct fld_cache *cache;
+
+ LASSERT(name != NULL);
+ LASSERT(cache_threshold < cache_size);
+
+ OBD_ALLOC_PTR(cache);
+ if (cache == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&cache->fci_entries_head);
+ INIT_LIST_HEAD(&cache->fci_lru);
+
+ cache->fci_cache_count = 0;
+ rwlock_init(&cache->fci_lock);
+
+ strlcpy(cache->fci_name, name,
+ sizeof(cache->fci_name));
+
+ cache->fci_cache_size = cache_size;
+ cache->fci_threshold = cache_threshold;
+
+ /* Init fld cache info. */
+ memset(&cache->fci_stat, 0, sizeof(cache->fci_stat));
+
+ CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n",
+ cache->fci_name, cache_size, cache_threshold);
+
+ return cache;
+}
+
+/**
+ * destroy fld cache.
+ */
+void fld_cache_fini(struct fld_cache *cache)
+{
+ __u64 pct;
+
+ LASSERT(cache != NULL);
+ fld_cache_flush(cache);
+
+ if (cache->fci_stat.fst_count > 0) {
+ pct = cache->fci_stat.fst_cache * 100;
+ do_div(pct, cache->fci_stat.fst_count);
+ } else {
+ pct = 0;
+ }
+
+ CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name);
+ CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count);
+ CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache);
+ CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct);
+
+ OBD_FREE_PTR(cache);
+}
+
+/**
+ * delete given node from list.
+ */
+void fld_cache_entry_delete(struct fld_cache *cache,
+ struct fld_cache_entry *node)
+{
+ list_del(&node->fce_list);
+ list_del(&node->fce_lru);
+ cache->fci_cache_count--;
+ OBD_FREE_PTR(node);
+}
+
+/**
+ * Fix the list by checking each entry against the NEXT entry in order.
+ */
+static void fld_fix_new_list(struct fld_cache *cache)
+{
+ struct fld_cache_entry *f_curr;
+ struct fld_cache_entry *f_next;
+ struct lu_seq_range *c_range;
+ struct lu_seq_range *n_range;
+ struct list_head *head = &cache->fci_entries_head;
+
+restart_fixup:
+
+ list_for_each_entry_safe(f_curr, f_next, head, fce_list) {
+ c_range = &f_curr->fce_range;
+ n_range = &f_next->fce_range;
+
+ LASSERT(range_is_sane(c_range));
+ if (&f_next->fce_list == head)
+ break;
+
+ if (c_range->lsr_flags != n_range->lsr_flags)
+ continue;
+
+ LASSERTF(c_range->lsr_start <= n_range->lsr_start,
+ "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n",
+ PRANGE(c_range), PRANGE(n_range));
+
+ /* check merge possibility with next range */
+ if (c_range->lsr_end == n_range->lsr_start) {
+ if (c_range->lsr_index != n_range->lsr_index)
+ continue;
+ n_range->lsr_start = c_range->lsr_start;
+ fld_cache_entry_delete(cache, f_curr);
+ continue;
+ }
+
+ /* check if current range overlaps with next range. */
+ if (n_range->lsr_start < c_range->lsr_end) {
+ if (c_range->lsr_index == n_range->lsr_index) {
+ n_range->lsr_start = c_range->lsr_start;
+ n_range->lsr_end = max(c_range->lsr_end,
+ n_range->lsr_end);
+ fld_cache_entry_delete(cache, f_curr);
+ } else {
+ if (n_range->lsr_end <= c_range->lsr_end) {
+ *n_range = *c_range;
+ fld_cache_entry_delete(cache, f_curr);
+ } else
+ n_range->lsr_start = c_range->lsr_end;
+ }
+
+ /* we could also overlap with the next
+ * range; better to restart. */
+ goto restart_fixup;
+ }
+
+ /* kill duplicates */
+ if (c_range->lsr_start == n_range->lsr_start &&
+ c_range->lsr_end == n_range->lsr_end)
+ fld_cache_entry_delete(cache, f_curr);
+ }
+}
+
+/**
+ * add node to fld cache
+ */
+static inline void fld_cache_entry_add(struct fld_cache *cache,
+ struct fld_cache_entry *f_new,
+ struct list_head *pos)
+{
+ list_add(&f_new->fce_list, pos);
+ list_add(&f_new->fce_lru, &cache->fci_lru);
+
+ cache->fci_cache_count++;
+ fld_fix_new_list(cache);
+}
+
+/**
+ * Check if the cache needs to be shrunk. If so, do it:
+ * remove entries from the LRU list until the cache is small enough.
+ */
+static int fld_cache_shrink(struct fld_cache *cache)
+{
+ struct fld_cache_entry *flde;
+ struct list_head *curr;
+ int num = 0;
+
+ LASSERT(cache != NULL);
+
+ if (cache->fci_cache_count < cache->fci_cache_size)
+ return 0;
+
+ curr = cache->fci_lru.prev;
+
+ while (cache->fci_cache_count + cache->fci_threshold >
+ cache->fci_cache_size && curr != &cache->fci_lru) {
+
+ flde = list_entry(curr, struct fld_cache_entry, fce_lru);
+ curr = curr->prev;
+ fld_cache_entry_delete(cache, flde);
+ num++;
+ }
+
+ CDEBUG(D_INFO, "%s: FLD cache - Shrunk by %d entries\n",
+ cache->fci_name, num);
+
+ return 0;
+}
+
+/**
+ * kill all fld cache entries.
+ */
+void fld_cache_flush(struct fld_cache *cache)
+{
+ write_lock(&cache->fci_lock);
+ cache->fci_cache_size = 0;
+ fld_cache_shrink(cache);
+ write_unlock(&cache->fci_lock);
+}
+
+/**
+ * Punch a hole in an existing range: divide that range and add the new
+ * entry accordingly.
+ */
+
+static void fld_cache_punch_hole(struct fld_cache *cache,
+ struct fld_cache_entry *f_curr,
+ struct fld_cache_entry *f_new)
+{
+ const struct lu_seq_range *range = &f_new->fce_range;
+ const u64 new_start = range->lsr_start;
+ const u64 new_end = range->lsr_end;
+ struct fld_cache_entry *fldt;
+
+ OBD_ALLOC_GFP(fldt, sizeof(*fldt), GFP_ATOMIC);
+ if (!fldt) {
+ OBD_FREE_PTR(f_new);
+ /* overlap is not allowed, so don't mess up the list. */
+ return;
+ }
+ /* break f_curr RANGE into three RANGES:
+ * f_curr, f_new , fldt
+ */
+
+ /* f_new = *range */
+
+ /* fldt */
+ fldt->fce_range.lsr_start = new_end;
+ fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end;
+ fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index;
+
+ /* f_curr */
+ f_curr->fce_range.lsr_end = new_start;
+
+ /* add these two entries to list */
+ fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+ fld_cache_entry_add(cache, fldt, &f_new->fce_list);
+
+ /* no need to fixup */
+}
+
+/**
+ * handle range overlap in fld cache.
+ */
+static void fld_cache_overlap_handle(struct fld_cache *cache,
+ struct fld_cache_entry *f_curr,
+ struct fld_cache_entry *f_new)
+{
+ const struct lu_seq_range *range = &f_new->fce_range;
+ const u64 new_start = range->lsr_start;
+ const u64 new_end = range->lsr_end;
+ const u32 mdt = range->lsr_index;
+
+ /* This is the overlap case; these checks only handle overlap with the
+ * previous range. Fixup will handle overlap with the next range. */
+
+ if (f_curr->fce_range.lsr_index == mdt) {
+ f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start,
+ new_start);
+
+ f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end,
+ new_end);
+
+ OBD_FREE_PTR(f_new);
+ fld_fix_new_list(cache);
+
+ } else if (new_start <= f_curr->fce_range.lsr_start &&
+ f_curr->fce_range.lsr_end <= new_end) {
+ /* case 1: new range completely overshadowed existing range.
+ * e.g. whole range migrated. update fld cache entry */
+
+ f_curr->fce_range = *range;
+ OBD_FREE_PTR(f_new);
+ fld_fix_new_list(cache);
+
+ } else if (f_curr->fce_range.lsr_start < new_start &&
+ new_end < f_curr->fce_range.lsr_end) {
+ /* case 2: new range fit within existing range. */
+
+ fld_cache_punch_hole(cache, f_curr, f_new);
+
+ } else if (new_end <= f_curr->fce_range.lsr_end) {
+ /* case 3: overlap:
+ * [new_start [c_start new_end) c_end)
+ */
+
+ LASSERT(new_start <= f_curr->fce_range.lsr_start);
+
+ f_curr->fce_range.lsr_start = new_end;
+ fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev);
+
+ } else if (f_curr->fce_range.lsr_start <= new_start) {
+ /* case 4: overlap:
+ * [c_start [new_start c_end) new_end)
+ */
+
+ LASSERT(f_curr->fce_range.lsr_end <= new_end);
+
+ f_curr->fce_range.lsr_end = new_start;
+ fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+ } else
+ CERROR("NEW range ="DRANGE" curr = "DRANGE"\n",
+ PRANGE(range), PRANGE(&f_curr->fce_range));
+}
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range)
+{
+ struct fld_cache_entry *f_new;
+
+ LASSERT(range_is_sane(range));
+
+ OBD_ALLOC_PTR(f_new);
+ if (!f_new)
+ return ERR_PTR(-ENOMEM);
+
+ f_new->fce_range = *range;
+ return f_new;
+}
+
+/**
+ * Insert FLD entry in FLD cache.
+ *
+ * This function handles all cases of merging and breaking up of
+ * ranges.
+ */
+int fld_cache_insert_nolock(struct fld_cache *cache,
+ struct fld_cache_entry *f_new)
+{
+ struct fld_cache_entry *f_curr;
+ struct fld_cache_entry *n;
+ struct list_head *head;
+ struct list_head *prev = NULL;
+ const u64 new_start = f_new->fce_range.lsr_start;
+ const u64 new_end = f_new->fce_range.lsr_end;
+ __u32 new_flags = f_new->fce_range.lsr_flags;
+
+ /*
+ * Duplicate entries are eliminated in insert op.
+ * So we don't need to search new entry before starting
+ * insertion loop.
+ */
+
+ if (!cache->fci_no_shrink)
+ fld_cache_shrink(cache);
+
+ head = &cache->fci_entries_head;
+
+ list_for_each_entry_safe(f_curr, n, head, fce_list) {
+ /* stop once the new range ends before this entry starts */
+ if (new_end < f_curr->fce_range.lsr_start ||
+ (new_end == f_curr->fce_range.lsr_start &&
+ new_flags != f_curr->fce_range.lsr_flags))
+ break;
+
+ prev = &f_curr->fce_list;
+ /* check if this range is to left of new range. */
+ if (new_start < f_curr->fce_range.lsr_end &&
+ new_flags == f_curr->fce_range.lsr_flags) {
+ fld_cache_overlap_handle(cache, f_curr, f_new);
+ goto out;
+ }
+ }
+
+ if (prev == NULL)
+ prev = head;
+
+ CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range));
+ /* Add new entry to cache and lru list. */
+ fld_cache_entry_add(cache, f_new, prev);
+out:
+ return 0;
+}
+
+int fld_cache_insert(struct fld_cache *cache,
+ const struct lu_seq_range *range)
+{
+ struct fld_cache_entry *flde;
+ int rc;
+
+ flde = fld_cache_entry_create(range);
+ if (IS_ERR(flde))
+ return PTR_ERR(flde);
+
+ write_lock(&cache->fci_lock);
+ rc = fld_cache_insert_nolock(cache, flde);
+ write_unlock(&cache->fci_lock);
+ if (rc)
+ OBD_FREE_PTR(flde);
+
+ return rc;
+}
+
+void fld_cache_delete_nolock(struct fld_cache *cache,
+ const struct lu_seq_range *range)
+{
+ struct fld_cache_entry *flde;
+ struct fld_cache_entry *tmp;
+ struct list_head *head;
+
+ head = &cache->fci_entries_head;
+ list_for_each_entry_safe(flde, tmp, head, fce_list) {
+ /* delete the cache entry whose range matches the given range */
+ if (range->lsr_start == flde->fce_range.lsr_start ||
+ (range->lsr_end == flde->fce_range.lsr_end &&
+ range->lsr_flags == flde->fce_range.lsr_flags)) {
+ fld_cache_entry_delete(cache, flde);
+ break;
+ }
+ }
+}
+
+/**
+ * Delete an FLD entry from the FLD cache.
+ */
+void fld_cache_delete(struct fld_cache *cache,
+ const struct lu_seq_range *range)
+{
+ write_lock(&cache->fci_lock);
+ fld_cache_delete_nolock(cache, range);
+ write_unlock(&cache->fci_lock);
+}
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+ struct lu_seq_range *range)
+{
+ struct fld_cache_entry *flde;
+ struct fld_cache_entry *got = NULL;
+ struct list_head *head;
+
+ head = &cache->fci_entries_head;
+ list_for_each_entry(flde, head, fce_list) {
+ if (range->lsr_start == flde->fce_range.lsr_start ||
+ (range->lsr_end == flde->fce_range.lsr_end &&
+ range->lsr_flags == flde->fce_range.lsr_flags)) {
+ got = flde;
+ break;
+ }
+ }
+
+ return got;
+}
+
+/**
+ * Look up the given \a range in the FLD cache.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range)
+{
+ struct fld_cache_entry *got = NULL;
+
+ read_lock(&cache->fci_lock);
+ got = fld_cache_entry_lookup_nolock(cache, range);
+ read_unlock(&cache->fci_lock);
+ return got;
+}
+
+/**
+ * Look up the range containing sequence \a seq in the FLD cache.
+ */
+int fld_cache_lookup(struct fld_cache *cache,
+ const u64 seq, struct lu_seq_range *range)
+{
+ struct fld_cache_entry *flde;
+ struct fld_cache_entry *prev = NULL;
+ struct list_head *head;
+
+ read_lock(&cache->fci_lock);
+ head = &cache->fci_entries_head;
+
+ cache->fci_stat.fst_count++;
+ list_for_each_entry(flde, head, fce_list) {
+ if (flde->fce_range.lsr_start > seq) {
+ if (prev != NULL)
+ *range = prev->fce_range;
+ break;
+ }
+
+ prev = flde;
+ if (range_within(&flde->fce_range, seq)) {
+ *range = flde->fce_range;
+
+ cache->fci_stat.fst_cache++;
+ read_unlock(&cache->fci_lock);
+ return 0;
+ }
+ }
+ read_unlock(&cache->fci_lock);
+ return -ENOENT;
+}
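+
+/*
+ * Illustrative sketch, not part of the original import: expected
+ * fld_cache_lookup() behaviour. The helper is hypothetical and assumes a
+ * cache that already holds the range [0x400, 0x500) with lsr_index == 2.
+ */
+static int __maybe_unused fld_cache_lookup_sketch(struct fld_cache *cache)
+{
+	struct lu_seq_range range;
+	int rc;
+
+	/* Hit: 0x450 falls inside [0x400, 0x500), so rc == 0 and
+	 * range.lsr_index tells us which MDT (2) serves the sequence. */
+	rc = fld_cache_lookup(cache, 0x450, &range);
+	if (rc == 0 && range.lsr_index != 2)
+		return -EINVAL;
+
+	/* Miss: nothing is cached for 0x900, so -ENOENT is returned and the
+	 * caller must fall back to an FLD RPC (see fld_client_lookup()). */
+	rc = fld_cache_lookup(cache, 0x900, &range);
+	return rc == -ENOENT ? 0 : rc;
+}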
diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h
new file mode 100644
index 000000000..68bec7658
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_internal.h
@@ -0,0 +1,193 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Tom WangDi <wangdi@clusterfs.com>
+ */
+#ifndef __FLD_INTERNAL_H
+#define __FLD_INTERNAL_H
+
+#include "../include/lustre/lustre_idl.h"
+#include "../include/dt_object.h"
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/lustre_req_layout.h"
+#include "../include/lustre_fld.h"
+
+enum {
+ LUSTRE_FLD_INIT = 1 << 0,
+ LUSTRE_FLD_RUN = 1 << 1
+};
+
+struct fld_stats {
+ __u64 fst_count;
+ __u64 fst_cache;
+ __u64 fst_inflight;
+};
+
+typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64);
+
+typedef struct lu_fld_target *
+(*fld_scan_func_t) (struct lu_client_fld *, __u64);
+
+struct lu_fld_hash {
+ const char *fh_name;
+ fld_hash_func_t fh_hash_func;
+ fld_scan_func_t fh_scan_func;
+};
+
+struct fld_cache_entry {
+ struct list_head fce_lru;
+ struct list_head fce_list;
+ /**
+ * fld cache entries are sorted on range->lsr_start field. */
+ struct lu_seq_range fce_range;
+};
+
+struct fld_cache {
+ /**
+	 * Cache guard. It protects the entry and LRU lists and the counters;
+	 * the remaining fields are immutable after init.
+ */
+ rwlock_t fci_lock;
+
+ /**
+ * Cache shrink threshold */
+ int fci_threshold;
+
+ /**
+ * Preferred number of cached entries */
+ int fci_cache_size;
+
+ /**
+ * Current number of cached entries. Protected by \a fci_lock */
+ int fci_cache_count;
+
+ /**
+	 * LRU list of fld entries. */
+ struct list_head fci_lru;
+
+ /**
+ * sorted fld entries. */
+ struct list_head fci_entries_head;
+
+ /**
+ * Cache statistics. */
+ struct fld_stats fci_stat;
+
+ /**
+ * Cache name used for debug and messages. */
+ char fci_name[LUSTRE_MDT_MAXNAMELEN];
+ unsigned int fci_no_shrink:1;
+};
+
+enum fld_op {
+ FLD_CREATE = 0,
+ FLD_DELETE = 1,
+ FLD_LOOKUP = 2
+};
+
+enum {
+	/* 4MB of FLD cache will not hurt the server a lot. */
+ FLD_SERVER_CACHE_SIZE = (4 * 0x100000),
+
+	/* 1MB of FLD cache will not hurt the client a lot. */
+ FLD_CLIENT_CACHE_SIZE = (1 * 0x100000)
+};
+
+enum {
+ /* Cache threshold is 10 percent of size. */
+ FLD_SERVER_CACHE_THRESHOLD = 10,
+
+ /* Cache threshold is 10 percent of size. */
+ FLD_CLIENT_CACHE_THRESHOLD = 10
+};
+
+extern struct lu_fld_hash fld_hash[];
+
+int fld_client_rpc(struct obd_export *exp,
+ struct lu_seq_range *range, __u32 fld_op);
+
+#if defined(CONFIG_PROC_FS)
+extern struct lprocfs_vars fld_client_proc_list[];
+#endif
+
+
+struct fld_cache *fld_cache_init(const char *name,
+ int cache_size, int cache_threshold);
+
+void fld_cache_fini(struct fld_cache *cache);
+
+void fld_cache_flush(struct fld_cache *cache);
+
+int fld_cache_insert(struct fld_cache *cache,
+ const struct lu_seq_range *range);
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range);
+
+int fld_cache_insert_nolock(struct fld_cache *cache,
+ struct fld_cache_entry *f_new);
+void fld_cache_delete(struct fld_cache *cache,
+ const struct lu_seq_range *range);
+void fld_cache_delete_nolock(struct fld_cache *cache,
+ const struct lu_seq_range *range);
+int fld_cache_lookup(struct fld_cache *cache,
+ const u64 seq, struct lu_seq_range *range);
+
+struct fld_cache_entry*
+fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range);
+void fld_cache_entry_delete(struct fld_cache *cache,
+ struct fld_cache_entry *node);
+void fld_dump_cache_entries(struct fld_cache *cache);
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+ struct lu_seq_range *range);
+int fld_write_range(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_seq_range *range, struct thandle *th);
+
+static inline const char *
+fld_target_name(struct lu_fld_target *tar)
+{
+ if (tar->ft_srv != NULL)
+ return tar->ft_srv->lsf_name;
+
+ return (const char *)tar->ft_exp->exp_obd->obd_name;
+}
+
+#endif /* __FLD_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c
new file mode 100644
index 000000000..6ac225e90
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_request.c
@@ -0,0 +1,526 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_request.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/module.h>
+#include <asm/div64.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_ver.h"
+#include "../include/obd_support.h"
+#include "../include/lprocfs_status.h"
+
+#include "../include/dt_object.h"
+#include "../include/lustre_req_layout.h"
+#include "../include/lustre_fld.h"
+#include "../include/lustre_mdc.h"
+#include "fld_internal.h"
+
+/* TODO: these three functions are copies of the flow-control code in
+ * mdc_lib.c. They should be moved to common code, as should the mdc RPC
+ * lock. */
+static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+ int rc;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ rc = list_empty(&mcw->mcw_entry);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ return rc;
+};
+
+static void fld_enter_request(struct client_obd *cli)
+{
+ struct mdc_cache_waiter mcw;
+ struct l_wait_info lwi = { 0 };
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+ list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+ init_waitqueue_head(&mcw.mcw_waitq);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi);
+ } else {
+ cli->cl_r_in_flight++;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ }
+}
+
+static void fld_exit_request(struct client_obd *cli)
+{
+ struct list_head *l, *tmp;
+ struct mdc_cache_waiter *mcw;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_r_in_flight--;
+ list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+
+ if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+ /* No free request slots anymore */
+ break;
+ }
+
+ mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+ list_del_init(&mcw->mcw_entry);
+ cli->cl_r_in_flight++;
+ wake_up(&mcw->mcw_waitq);
+ }
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq)
+{
+ LASSERT(fld->lcf_count > 0);
+ return do_div(seq, fld->lcf_count);
+}
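+
+/*
+ * Illustrative sketch, not part of the original import: do_div() divides
+ * its 64-bit first argument in place and evaluates to the remainder, so
+ * fld_rrb_hash() above is effectively "seq % lcf_count".
+ */
+static u32 __maybe_unused fld_rrb_hash_sketch(void)
+{
+	u64 seq = 0x400000405ULL;
+
+	/* With 4 targets the remainder, and hence the hash, is 1; after the
+	 * call seq itself holds the quotient. */
+	return do_div(seq, 4);
+}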
+
+static struct lu_fld_target *
+fld_rrb_scan(struct lu_client_fld *fld, u64 seq)
+{
+ struct lu_fld_target *target;
+ int hash;
+
+	/* Because almost all special sequences are located on MDT0, they
+	 * should go to index 0 directly instead of being hashed again.
+	 * Also, if the other MDTs are not yet connected, FLD lookup
+	 * requests for sequences on MDT0 should not be blocked waiting
+	 * for them. */
+ if (fid_seq_is_norm(seq))
+ hash = fld_rrb_hash(fld, seq);
+ else
+ hash = 0;
+
+again:
+ list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+ if (target->ft_idx == hash)
+ return target;
+ }
+
+ if (hash != 0) {
+		/* It is possible that the remote target (MDT) is not yet
+		 * connected to the client, so refer this to MDT0, which
+		 * should have been connected during mount. */
+ hash = 0;
+ goto again;
+ }
+
+ CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n",
+ fld->lcf_name, hash, seq, fld->lcf_count);
+
+ list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+ const char *srv_name = target->ft_srv != NULL ?
+ target->ft_srv->lsf_name : "<null>";
+ const char *exp_name = target->ft_exp != NULL ?
+ (char *)target->ft_exp->exp_obd->obd_uuid.uuid :
+ "<null>";
+
+ CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n",
+ target->ft_exp, exp_name, target->ft_srv,
+ srv_name, target->ft_idx);
+ }
+
+ /*
+	 * If the target is not found, there is a logic error anyway, so
+	 * LBUG() here to catch this situation.
+ */
+ LBUG();
+ return NULL;
+}
+
+struct lu_fld_hash fld_hash[] = {
+ {
+ .fh_name = "RRB",
+ .fh_hash_func = fld_rrb_hash,
+ .fh_scan_func = fld_rrb_scan
+ },
+ {
+ NULL,
+ }
+};
+
+static struct lu_fld_target *
+fld_client_get_target(struct lu_client_fld *fld, u64 seq)
+{
+ struct lu_fld_target *target;
+
+ LASSERT(fld->lcf_hash != NULL);
+
+ spin_lock(&fld->lcf_lock);
+ target = fld->lcf_hash->fh_scan_func(fld, seq);
+ spin_unlock(&fld->lcf_lock);
+
+ if (target != NULL) {
+ CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n",
+ fld->lcf_name, target->ft_idx, seq);
+ }
+
+ return target;
+}
+
+/*
+ * Add an export to the FLD. This is usually done by CMM and LMV as they are
+ * the main users of the FLD module.
+ */
+int fld_client_add_target(struct lu_client_fld *fld,
+ struct lu_fld_target *tar)
+{
+ const char *name;
+ struct lu_fld_target *target, *tmp;
+
+ LASSERT(tar != NULL);
+ name = fld_target_name(tar);
+ LASSERT(name != NULL);
+ LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL);
+
+ if (fld->lcf_flags != LUSTRE_FLD_INIT) {
+ CERROR("%s: Attempt to add target %s (idx %llu) on fly - skip it\n",
+ fld->lcf_name, name, tar->ft_idx);
+ return 0;
+ }
+ CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n",
+ fld->lcf_name, name, tar->ft_idx);
+
+ OBD_ALLOC_PTR(target);
+ if (target == NULL)
+ return -ENOMEM;
+
+ spin_lock(&fld->lcf_lock);
+ list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) {
+ if (tmp->ft_idx == tar->ft_idx) {
+ spin_unlock(&fld->lcf_lock);
+ OBD_FREE_PTR(target);
+ CERROR("Target %s exists in FLD and known as %s:#%llu\n",
+ name, fld_target_name(tmp), tmp->ft_idx);
+ return -EEXIST;
+ }
+ }
+
+ target->ft_exp = tar->ft_exp;
+ if (target->ft_exp != NULL)
+ class_export_get(target->ft_exp);
+ target->ft_srv = tar->ft_srv;
+ target->ft_idx = tar->ft_idx;
+
+ list_add_tail(&target->ft_chain,
+ &fld->lcf_targets);
+
+ fld->lcf_count++;
+ spin_unlock(&fld->lcf_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(fld_client_add_target);
+
+/* Remove export from FLD */
+int fld_client_del_target(struct lu_client_fld *fld, __u64 idx)
+{
+ struct lu_fld_target *target, *tmp;
+
+ spin_lock(&fld->lcf_lock);
+ list_for_each_entry_safe(target, tmp,
+ &fld->lcf_targets, ft_chain) {
+ if (target->ft_idx == idx) {
+ fld->lcf_count--;
+ list_del(&target->ft_chain);
+ spin_unlock(&fld->lcf_lock);
+
+ if (target->ft_exp != NULL)
+ class_export_put(target->ft_exp);
+
+ OBD_FREE_PTR(target);
+ return 0;
+ }
+ }
+ spin_unlock(&fld->lcf_lock);
+ return -ENOENT;
+}
+EXPORT_SYMBOL(fld_client_del_target);
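+
+/*
+ * Illustrative sketch, not part of the original import: how a caller (LMV,
+ * in practice) might register an MDT export with the client FLD. The helper
+ * name and calling context are hypothetical.
+ */
+static int __maybe_unused fld_register_mdt_sketch(struct lu_client_fld *fld,
+						   struct obd_export *exp,
+						   __u64 idx)
+{
+	struct lu_fld_target tar = {
+		.ft_exp = exp,	/* client-side export of the target MDT */
+		.ft_srv = NULL,	/* no local server instance on a client */
+		.ft_idx = idx,	/* target index used by the RRB hash */
+	};
+
+	/* Targets must be added while the FLD is still in LUSTRE_FLD_INIT;
+	 * later additions are skipped with an error message (see above). */
+	return fld_client_add_target(fld, &tar);
+}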
+
+static struct proc_dir_entry *fld_type_proc_dir;
+
+#if defined(CONFIG_PROC_FS)
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+ int rc;
+
+ fld->lcf_proc_dir = lprocfs_register(fld->lcf_name,
+ fld_type_proc_dir,
+ NULL, NULL);
+
+ if (IS_ERR(fld->lcf_proc_dir)) {
+ CERROR("%s: LProcFS failed in fld-init\n",
+ fld->lcf_name);
+ rc = PTR_ERR(fld->lcf_proc_dir);
+ return rc;
+ }
+
+ rc = lprocfs_add_vars(fld->lcf_proc_dir,
+ fld_client_proc_list, fld);
+ if (rc) {
+ CERROR("%s: Can't init FLD proc, rc %d\n",
+ fld->lcf_name, rc);
+ goto out_cleanup;
+ }
+
+ return 0;
+
+out_cleanup:
+ fld_client_proc_fini(fld);
+ return rc;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+ if (fld->lcf_proc_dir) {
+ if (!IS_ERR(fld->lcf_proc_dir))
+ lprocfs_remove(&fld->lcf_proc_dir);
+ fld->lcf_proc_dir = NULL;
+ }
+}
+#else
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+ return 0;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+}
+#endif
+EXPORT_SYMBOL(fld_client_proc_fini);
+
+static inline int hash_is_sane(int hash)
+{
+ return (hash >= 0 && hash < ARRAY_SIZE(fld_hash));
+}
+
+int fld_client_init(struct lu_client_fld *fld,
+ const char *prefix, int hash)
+{
+ int cache_size, cache_threshold;
+ int rc;
+
+ LASSERT(fld != NULL);
+
+ snprintf(fld->lcf_name, sizeof(fld->lcf_name),
+ "cli-%s", prefix);
+
+ if (!hash_is_sane(hash)) {
+ CERROR("%s: Wrong hash function %#x\n",
+ fld->lcf_name, hash);
+ return -EINVAL;
+ }
+
+ fld->lcf_count = 0;
+ spin_lock_init(&fld->lcf_lock);
+ fld->lcf_hash = &fld_hash[hash];
+ fld->lcf_flags = LUSTRE_FLD_INIT;
+ INIT_LIST_HEAD(&fld->lcf_targets);
+
+ cache_size = FLD_CLIENT_CACHE_SIZE /
+ sizeof(struct fld_cache_entry);
+
+ cache_threshold = cache_size *
+ FLD_CLIENT_CACHE_THRESHOLD / 100;
+
+ fld->lcf_cache = fld_cache_init(fld->lcf_name,
+ cache_size, cache_threshold);
+ if (IS_ERR(fld->lcf_cache)) {
+ rc = PTR_ERR(fld->lcf_cache);
+ fld->lcf_cache = NULL;
+ goto out;
+ }
+
+ rc = fld_client_proc_init(fld);
+ if (rc)
+ goto out;
+out:
+ if (rc)
+ fld_client_fini(fld);
+ else
+ CDEBUG(D_INFO, "%s: Using \"%s\" hash\n",
+ fld->lcf_name, fld->lcf_hash->fh_name);
+ return rc;
+}
+EXPORT_SYMBOL(fld_client_init);
+
+void fld_client_fini(struct lu_client_fld *fld)
+{
+ struct lu_fld_target *target, *tmp;
+
+ spin_lock(&fld->lcf_lock);
+ list_for_each_entry_safe(target, tmp,
+ &fld->lcf_targets, ft_chain) {
+ fld->lcf_count--;
+ list_del(&target->ft_chain);
+ if (target->ft_exp != NULL)
+ class_export_put(target->ft_exp);
+ OBD_FREE_PTR(target);
+ }
+ spin_unlock(&fld->lcf_lock);
+
+ if (fld->lcf_cache != NULL) {
+ if (!IS_ERR(fld->lcf_cache))
+ fld_cache_fini(fld->lcf_cache);
+ fld->lcf_cache = NULL;
+ }
+}
+EXPORT_SYMBOL(fld_client_fini);
+
+int fld_client_rpc(struct obd_export *exp,
+ struct lu_seq_range *range, __u32 fld_op)
+{
+ struct ptlrpc_request *req;
+ struct lu_seq_range *prange;
+ __u32 *op;
+ int rc;
+ struct obd_import *imp;
+
+ LASSERT(exp != NULL);
+
+ imp = class_exp2cliimp(exp);
+ req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION,
+ FLD_QUERY);
+ if (req == NULL)
+ return -ENOMEM;
+
+ op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC);
+ *op = fld_op;
+
+ prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD);
+ *prange = *range;
+
+ ptlrpc_request_set_replen(req);
+ req->rq_request_portal = FLD_REQUEST_PORTAL;
+ req->rq_reply_portal = MDC_REPLY_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ if (fld_op == FLD_LOOKUP &&
+ imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS)
+ req->rq_allow_replay = 1;
+
+ if (fld_op != FLD_LOOKUP)
+ mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+ fld_enter_request(&exp->exp_obd->u.cli);
+ rc = ptlrpc_queue_wait(req);
+ fld_exit_request(&exp->exp_obd->u.cli);
+ if (fld_op != FLD_LOOKUP)
+ mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+ if (rc)
+ goto out_req;
+
+ prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD);
+ if (prange == NULL) {
+ rc = -EFAULT;
+ goto out_req;
+ }
+ *range = *prange;
+out_req:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds,
+ __u32 flags, const struct lu_env *env)
+{
+ struct lu_seq_range res = { 0 };
+ struct lu_fld_target *target;
+ int rc;
+
+ fld->lcf_flags |= LUSTRE_FLD_RUN;
+
+ rc = fld_cache_lookup(fld->lcf_cache, seq, &res);
+ if (rc == 0) {
+ *mds = res.lsr_index;
+ return 0;
+ }
+
+	/* Cannot find it in the cache */
+ target = fld_client_get_target(fld, seq);
+ LASSERT(target != NULL);
+
+ CDEBUG(D_INFO, "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n",
+ fld->lcf_name, seq, fld_target_name(target), target->ft_idx);
+
+ res.lsr_start = seq;
+ fld_range_set_type(&res, flags);
+ rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP);
+
+ if (rc == 0) {
+ *mds = res.lsr_index;
+
+ fld_cache_insert(fld->lcf_cache, &res);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(fld_client_lookup);
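+
+/*
+ * Illustrative sketch, not part of the original import: a typical caller
+ * only wants to know which MDT index serves a sequence. The cache answers
+ * most requests; a miss falls through to the FLD_LOOKUP RPC above. The
+ * LU_SEQ_RANGE_MDT flag from lustre_idl.h is assumed here.
+ */
+static int __maybe_unused fld_locate_seq_sketch(const struct lu_env *env,
+						struct lu_client_fld *fld,
+						u64 seq, u32 *mdt_idx)
+{
+	/* On success, *mdt_idx holds the index of the serving MDT. */
+	return fld_client_lookup(fld, seq, mdt_idx, LU_SEQ_RANGE_MDT, env);
+}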
+
+void fld_client_flush(struct lu_client_fld *fld)
+{
+ fld_cache_flush(fld->lcf_cache);
+}
+EXPORT_SYMBOL(fld_client_flush);
+
+static int __init fld_mod_init(void)
+{
+ fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME,
+ proc_lustre_root,
+ NULL, NULL);
+ return PTR_ERR_OR_ZERO(fld_type_proc_dir);
+}
+
+static void __exit fld_mod_exit(void)
+{
+ if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) {
+ lprocfs_remove(&fld_type_proc_dir);
+ fld_type_proc_dir = NULL;
+ }
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FLD");
+MODULE_LICENSE("GPL");
+
+module_init(fld_mod_init)
+module_exit(fld_mod_exit)
diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c
new file mode 100644
index 000000000..f53fdcfae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/lproc_fld.c
@@ -0,0 +1,172 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/lproc_fld.c
+ *
+ * FLD (FIDs Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Di Wang <di.wang@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/module.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/dt_object.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_req_layout.h"
+#include "../include/lustre_fld.h"
+#include "../include/lustre_fid.h"
+#include "fld_internal.h"
+
+static int
+fld_proc_targets_seq_show(struct seq_file *m, void *unused)
+{
+ struct lu_client_fld *fld = (struct lu_client_fld *)m->private;
+ struct lu_fld_target *target;
+
+ LASSERT(fld != NULL);
+
+ spin_lock(&fld->lcf_lock);
+ list_for_each_entry(target,
+ &fld->lcf_targets, ft_chain)
+ seq_printf(m, "%s\n", fld_target_name(target));
+ spin_unlock(&fld->lcf_lock);
+
+ return 0;
+}
+
+static int
+fld_proc_hash_seq_show(struct seq_file *m, void *unused)
+{
+ struct lu_client_fld *fld = (struct lu_client_fld *)m->private;
+
+ LASSERT(fld != NULL);
+
+ spin_lock(&fld->lcf_lock);
+ seq_printf(m, "%s\n", fld->lcf_hash->fh_name);
+ spin_unlock(&fld->lcf_lock);
+
+ return 0;
+}
+
+static ssize_t
+fld_proc_hash_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct lu_client_fld *fld;
+ struct lu_fld_hash *hash = NULL;
+ char fh_name[8];
+ int i;
+
+ if (count > sizeof(fh_name))
+ return -ENAMETOOLONG;
+
+ if (copy_from_user(fh_name, buffer, count) != 0)
+ return -EFAULT;
+
+ fld = ((struct seq_file *)file->private_data)->private;
+ LASSERT(fld != NULL);
+
+ for (i = 0; fld_hash[i].fh_name != NULL; i++) {
+ if (count != strlen(fld_hash[i].fh_name))
+ continue;
+
+ if (!strncmp(fld_hash[i].fh_name, fh_name, count)) {
+ hash = &fld_hash[i];
+ break;
+ }
+ }
+
+ if (hash != NULL) {
+ spin_lock(&fld->lcf_lock);
+ fld->lcf_hash = hash;
+ spin_unlock(&fld->lcf_lock);
+
+ CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n",
+ fld->lcf_name, hash->fh_name);
+ }
+
+ return count;
+}
+
+static ssize_t
+fld_proc_cache_flush_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *pos)
+{
+ struct lu_client_fld *fld = file->private_data;
+
+ LASSERT(fld != NULL);
+
+ fld_cache_flush(fld->lcf_cache);
+
+ CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name);
+
+ return count;
+}
+
+static int fld_proc_cache_flush_open(struct inode *inode, struct file *file)
+{
+ file->private_data = PDE_DATA(inode);
+ return 0;
+}
+
+static int fld_proc_cache_flush_release(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return 0;
+}
+
+static struct file_operations fld_proc_cache_flush_fops = {
+ .owner = THIS_MODULE,
+ .open = fld_proc_cache_flush_open,
+ .write = fld_proc_cache_flush_write,
+ .release = fld_proc_cache_flush_release,
+};
+
+LPROC_SEQ_FOPS_RO(fld_proc_targets);
+LPROC_SEQ_FOPS(fld_proc_hash);
+
+struct lprocfs_vars fld_client_proc_list[] = {
+ { "targets", &fld_proc_targets_fops },
+ { "hash", &fld_proc_hash_fops },
+ { "cache_flush", &fld_proc_cache_flush_fops },
+ { NULL }
+};
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
new file mode 100644
index 000000000..d56c8bea8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/cl_object.h
@@ -0,0 +1,3287 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ * - cl_object
+ *
+ * - cl_page
+ *
+ * - cl_lock represents an extent lock on an object.
+ *
+ * - cl_io represents high-level i/o activity such as whole read/write
+ * system call, or write-out of pages from under the lock being
+ * canceled. cl_io has sub-ios that can be stopped and resumed
+ * independently, thus achieving high degree of transfer
+ * parallelism. Single cl_io can be advanced forward by
+ * the multiple threads (although in the most usual case of
+ * read/write system call it is associated with the single user
+ * thread, that issued the system call).
+ *
+ * - cl_req represents a collection of pages for a transfer. cl_req is
+ * constructed by req-forming engine that tries to saturate
+ * transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ * - to avoid confusion high-level I/O operation like read or write system
+ * call is referred to as "an io", whereas low-level I/O operation, like
+ * RPC, is referred to as "a transfer"
+ *
+ * - "generic code" means generic (not file system specific) code in the
+ * hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ * is not layer specific.
+ *
+ * Locking.
+ *
+ * - i_mutex
+ * - PG_locked
+ * - cl_object_header::coh_page_guard
+ * - cl_object_header::coh_lock_guard
+ * - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include "lu_object.h"
+#include "linux/lustre_compat25.h"
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+ /**
+ * Initialize cl_req. This method is called top-to-bottom on all
+	 * devices in the stack to give them a chance to allocate layer-private
+ * data, and to attach them to the cl_req by calling
+ * cl_req_slice_add().
+ *
+ * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+ * \see ccc_req_init()
+ */
+ int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req);
+};
+
+/**
+ * Device in the client stack.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+ /** Super-class. */
+ struct lu_device cd_lu_dev;
+ /** Per-layer operation vector. */
+ const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and top-object's attributes are calculated
+ * from sub-objects' ones.
+ */
+struct cl_attr {
+ /** Object size, in bytes */
+ loff_t cat_size;
+ /**
+ * Known minimal size, in bytes.
+ *
+ * This is only valid when at least one DLM lock is held.
+ */
+ loff_t cat_kms;
+ /** Modification time. Measured in seconds since epoch. */
+ time_t cat_mtime;
+ /** Access time. Measured in seconds since epoch. */
+ time_t cat_atime;
+ /** Change time. Measured in seconds since epoch. */
+ time_t cat_ctime;
+ /**
+ * Blocks allocated to this cl_object on the server file system.
+ *
+ * \todo XXX An interface for block size is needed.
+ */
+ __u64 cat_blocks;
+ /**
+ * User identifier for quota purposes.
+ */
+ uid_t cat_uid;
+ /**
+ * Group identifier for quota purposes.
+ */
+ gid_t cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set.
+ */
+enum cl_attr_valid {
+ CAT_SIZE = 1 << 0,
+ CAT_KMS = 1 << 1,
+ CAT_MTIME = 1 << 3,
+ CAT_ATIME = 1 << 4,
+ CAT_CTIME = 1 << 5,
+ CAT_BLOCKS = 1 << 6,
+ CAT_UID = 1 << 7,
+ CAT_GID = 1 << 8
+};
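+
+/*
+ * Illustrative sketch, not part of the original import: callers combine the
+ * flags above into the \a valid bitmask handed to
+ * cl_object_operations::coo_attr_set(), e.g. to update size and mtime only
+ * (new_size and now are hypothetical values):
+ *
+ *	struct cl_attr attr = { .cat_size = new_size, .cat_mtime = now };
+ *	unsigned valid = CAT_SIZE | CAT_MTIME;
+ */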
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ * stripe. cl_object is based on lu_object: it is identified by a fid,
+ * layered, cached, hashed, and lrued. Important distinction with the server
+ * side, where md_object and dt_object are used, is that cl_object "fans out"
+ * at the lov/sns level: depending on the file layout, single file is
+ * represented as a set of "sub-objects" (stripes). At the implementation
+ * level, struct lov_object contains an array of cl_objects. Each sub-object
+ * is a full-fledged cl_object, having its fid, living in the lru and hash
+ * table.
+ *
+ * This leads to the next important difference with the server side: on the
+ * client, it's quite usual to have objects with different sequences of
+ * layers. For example, a typical top-object is composed of the following
+ * layers:
+ *
+ * - vvp
+ * - lov
+ *
+ * whereas its sub-objects are composed of
+ *
+ * - lovsub
+ * - osc
+ *
+ * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ * track of the object-subobject relationship.
+ *
+ * Sub-objects are not cached independently: when top-object is about to
+ * be discarded from the memory, all its sub-objects are torn-down and
+ * destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+ /** super class */
+ struct lu_object co_lu;
+ /** per-object-layer operations */
+ const struct cl_object_operations *co_ops;
+ /** offset of page slice in cl_page buffer */
+ int co_slice_off;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by more state than
+ * just a fid.
+ */
+struct cl_object_conf {
+ /** Super-class. */
+ struct lu_object_conf coc_lu;
+ union {
+ /**
+ * Object layout. This is consumed by lov.
+ */
+ struct lustre_md *coc_md;
+ /**
+ * Description of particular stripe location in the
+ * cluster. This is consumed by osc.
+ */
+ struct lov_oinfo *coc_oinfo;
+ } u;
+ /**
+ * VFS inode. This is consumed by vvp.
+ */
+ struct inode *coc_inode;
+ /**
+ * Layout lock handle.
+ */
+ struct ldlm_lock *coc_lock;
+ /**
+ * Operation to handle layout, OBJECT_CONF_XYZ.
+ */
+ int coc_opc;
+};
+
+enum {
+ /** configure layout, set up a new stripe, must be called while
+ * holding layout lock. */
+ OBJECT_CONF_SET = 0,
+ /** invalidate the current stripe configuration due to losing
+ * layout lock. */
+ OBJECT_CONF_INVALIDATE = 1,
+ /** wait for old layout to go away so that new layout can be
+ * set up. */
+ OBJECT_CONF_WAIT = 2
+};
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+ /**
+ * Initialize page slice for this layer. Called top-to-bottom through
+ * every object layer when a new cl_page is instantiated. Layer
+ * keeping private per-page data, or requiring its own page operations
+ * vector should allocate these data here, and attach then to the page
+	 * vector should allocate this data here and attach it to the page
+ * sense). Optional.
+ *
+ * \retval NULL success.
+ *
+ * \retval ERR_PTR(errno) failure code.
+ *
+ * \retval valid-pointer pointer to already existing referenced page
+ * to be used instead of newly created.
+ */
+ int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+ /**
+ * Initialize lock slice for this layer. Called top-to-bottom through
+ * every object layer when a new cl_lock is instantiated. Layer
+ * keeping private per-lock data, or requiring its own lock operations
+	 * vector should allocate this data here and attach it to the lock
+ * by calling cl_lock_slice_add(). Mandatory.
+ */
+ int (*coo_lock_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *io);
+ /**
+ * Initialize io state for a given layer.
+ *
+ * called top-to-bottom once per io existence to initialize io
+ * state. If layer wants to keep some state for this type of io, it
+ * has to embed struct cl_io_slice in lu_env::le_ses, and register
+ * slice with cl_io_slice_add(). It is guaranteed that all threads
+ * participating in this io share the same session.
+ */
+ int (*coo_io_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io);
+ /**
+ * Fill portion of \a attr that this layer controls. This method is
+ * called top-to-bottom through all object layers.
+ *
+ * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+ *
+ * \return 0: to continue
+ * \return +ve: to stop iterating through layers (but 0 is returned
+ * from enclosing cl_object_attr_get())
+ * \return -ve: to signal error
+ */
+ int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+ /**
+ * Update attributes.
+ *
+ * \a valid is a bitmask composed from enum #cl_attr_valid, and
+ * indicating what attributes are to be set.
+ *
+ * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+ *
+ * \return the same convention as for
+ * cl_object_operations::coo_attr_get() is used.
+ */
+ int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+ /**
+ * Update object configuration. Called top-to-bottom to modify object
+ * configuration.
+ *
+ * XXX error conditions and handling.
+ */
+ int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf);
+ /**
+ * Glimpse ast. Executed when glimpse ast arrives for a lock on this
+ * object. Layers are supposed to fill parts of \a lvb that will be
+ * shipped to the glimpse originator as a glimpse result.
+ *
+ * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+ * \see osc_object_glimpse()
+ */
+ int (*coo_glimpse)(const struct lu_env *env,
+ const struct cl_object *obj, struct ost_lvb *lvb);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+ /** Standard lu_object_header. cl_object::co_lu::lo_header points
+ * here. */
+ struct lu_object_header coh_lu;
+ /** \name locks
+ * \todo XXX move locks below to the separate cache-lines, they are
+ * mostly useless otherwise.
+ */
+ /** @{ */
+ /** Lock protecting page tree. */
+ spinlock_t coh_page_guard;
+ /** Lock protecting lock list. */
+ spinlock_t coh_lock_guard;
+ /** @} locks */
+ /** Radix tree of cl_page's, cached for this object. */
+ struct radix_tree_root coh_tree;
+ /** # of pages in radix tree. */
+ unsigned long coh_pages;
+ /** List of cl_lock's granted for this object. */
+ struct list_head coh_locks;
+
+ /**
+ * Parent object. It is assumed that an object has a well-defined
+ * parent, but not a well-defined child (there may be multiple
+ * sub-objects, for the same top-object). cl_object_header::coh_parent
+ * field allows certain code to be written generically, without
+ * limiting possible cl_object layouts unduly.
+ */
+ struct cl_object_header *coh_parent;
+ /**
+ * Protects consistency between cl_attr of parent object and
+ * attributes of sub-objects, that the former is calculated ("merged")
+ * from.
+ *
+ * \todo XXX this can be read/write lock if needed.
+ */
+ spinlock_t coh_attr_guard;
+ /**
+ * Size of cl_page + page slices
+ */
+ unsigned short coh_page_bufsize;
+ /**
+ * Number of objects above this one: 0 for a top-object, 1 for its
+ * sub-object, etc.
+ */
+ unsigned char coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer top-to-bottom to \a slice.
+ */
+#define cl_object_for_each(slice, obj) \
+ list_for_each_entry((slice), \
+ &(obj)->co_lu.lo_header->loh_layers, \
+ co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer bottom-to-top to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj) \
+ list_for_each_entry_reverse((slice), \
+ &(obj)->co_lu.lo_header->loh_layers, \
+ co_lu.lo_linkage)
+/** @} cl_object */
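+
+/*
+ * Illustrative sketch, not part of the original import: how the iteration
+ * macros above are typically used to dispatch a per-layer method such as
+ * cl_object_operations::coo_io_init(). The real dispatch code in obdclass
+ * differs in detail; this hypothetical helper only shows the pattern of
+ * walking the layers top-to-bottom and stopping at the first non-zero
+ * result.
+ */
+static inline int cl_object_io_init_sketch(const struct lu_env *env,
+					   struct cl_object *top,
+					   struct cl_io *io)
+{
+	struct cl_object *obj;
+	int result = 0;
+
+	/* walk layers top-to-bottom; a layer may veto with a non-zero rc */
+	cl_object_for_each(obj, top) {
+		if (obj->co_ops->coo_io_init != NULL) {
+			result = obj->co_ops->coo_io_init(env, obj, io);
+			if (result != 0)
+				break;
+		}
+	}
+	return result;
+}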
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ * of the given file are of the same size, and are kept in the radix tree
+ * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ * of the top-level file object are first class cl_objects, they have their
+ * own radix trees of pages and hence page is implemented as a sequence of
+ * struct cl_pages's, linked into double-linked list through
+ * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ * corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with VM page of the hosting environment (struct
+ * page in Linux kernel, for example), struct page. It is assumed, that this
+ * association is implemented by one of cl_page layers (top layer in the
+ * current design) that
+ *
+ * - intercepts per-VM-page call-backs made by the environment (e.g.,
+ * memory pressure),
+ *
+ * - translates state (page flag bits) and locking between lustre and
+ * environment.
+ *
+ * The association between cl_page and struct page is immutable and
+ * established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ * this io an exclusive access to this page w.r.t. other io attempts and
+ * various events changing page state (such as transfer completion, or
+ * eviction of the page from the memory). Note, that in general cl_io
+ * cannot be identified with a particular thread, and page ownership is not
+ * exactly equal to the current thread holding a lock on the page. Layer
+ * implementing association between cl_page and struct page has to implement
+ * ownership on top of available synchronization mechanisms.
+ *
+ * While the lustre client maintains the notion of page ownership by an io,
+ * hosting MM/VM usually has its own page concurrency control
+ * mechanisms. For example, in Linux, page access is synchronized by the
+ * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ * takes care to acquire and release such locks as necessary around the
+ * calls to the file system methods (->readpage(), ->prepare_write(),
+ * ->commit_write(), etc.). This leads to the situation when there are two
+ * different ways to own a page in the client:
+ *
+ * - client code explicitly and voluntary owns the page (cl_page_own());
+ *
+ * - VM locks a page and then calls the client, that has "to assume"
+ * the ownership from the VM (cl_page_assume()).
+ *
+ * Dual methods to release ownership are cl_page_disown() and
+ * cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ * drops to 0, the page is returned to the cache, unless it is in
+ * cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * The general logic guaranteeing the absence of "existential races" for
+ * pages is the following:
+ *
+ * - there are fixed known ways for a thread to obtain a new reference
+ * to a page:
+ *
+ * - by doing a lookup in the cl_object radix tree, protected by the
+ * spin-lock;
+ *
+ * - by starting from VM-locked struct page and following some
+ * hosting environment method (e.g., following ->private pointer in
+ * the case of Linux kernel), see cl_vmpage_page();
+ *
+ * - when the page enters cl_page_state::CPS_FREEING state, all these
+ * ways are severed with the proper synchronization
+ * (cl_page_delete());
+ *
+ * - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ * lock;
+ *
+ * - no new references to the page in cl_page_state::CPS_FREEING state
+ * are allowed (checked in cl_page_get()).
+ *
+ * Together this guarantees that when last reference to a
+ * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ * page, as neither references to it can be acquired at that point, nor
+ * ones exist.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ * cl_page_state. Possible state transitions are enumerated in
+ * cl_page_state_set(). State transition process (i.e., actual changing of
+ * cl_page::cp_state field) is protected by the lock on the underlying VM
+ * page.
+ *
+ * Linux Kernel implementation.
+ *
+ * Binding between cl_page and the VM page (struct page) is implemented in
+ * the vvp layer. cl_page is attached to the
+ * ->private pointer of the struct page, together with the setting of
+ * PG_private bit in page->flags, and acquiring additional reference on the
+ * struct page (much like struct buffer_head, or any similar file system
+ * private data structures).
+ *
+ * PG_locked lock is used to implement both ownership and transfer
+ * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ * states. No additional references are acquired for the duration of the
+ * transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ * write-out is "protected" by the special PG_writeback bit.
+ */
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+ /**
+ * Page is in the cache, un-owned. Page leaves cached state in the
+ * following cases:
+ *
+ * - [cl_page_state::CPS_OWNED] io comes across the page and
+ * owns it;
+ *
+ * - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+ * req-formation engine decides that it wants to include this page
+ * into an cl_req being constructed, and yanks it from the cache;
+ *
+ * - [cl_page_state::CPS_FREEING] VM callback is executed to
+	 * evict the page from memory;
+ *
+ * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+ */
+ CPS_CACHED,
+ /**
+ * Page is exclusively owned by some cl_io. Page may end up in this
+ * state as a result of
+ *
+ * - io creating new page and immediately owning it;
+ *
+ * - [cl_page_state::CPS_CACHED] io finding existing cached page
+ * and owning it;
+ *
+ * - [cl_page_state::CPS_OWNED] io finding existing owned page
+ * and waiting for owner to release the page;
+ *
+ * Page leaves owned state in the following cases:
+ *
+ * - [cl_page_state::CPS_CACHED] io decides to leave the page in
+ * the cache, doing nothing;
+ *
+ * - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+ * this page;
+ *
+ * - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+ * transfer for this page;
+ *
+ * - [cl_page_state::CPS_FREEING] io decides to destroy this
+ * page (e.g., as part of truncate or extent lock cancellation).
+ *
+ * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+ */
+ CPS_OWNED,
+ /**
+ * Page is being written out, as a part of a transfer. This state is
+ * entered when req-formation logic decided that it wants this page to
+ * be sent through the wire _now_. Specifically, it means that once
+ * this state is achieved, transfer completion handler (with either
+ * success or failure indication) is guaranteed to be executed against
+ * this page independently of any locks and any scheduling decisions
+ * made by the hosting environment (that effectively means that the
+ * page is never put into cl_page_state::CPS_PAGEOUT state "in
+ * advance". This property is mentioned, because it is important when
+ * reasoning about possible dead-locks in the system). The page can
+ * enter this state as a result of
+ *
+ * - [cl_page_state::CPS_OWNED] an io requesting an immediate
+ * write-out of this page, or
+ *
+ * - [cl_page_state::CPS_CACHED] req-forming engine deciding
+ * that it has enough dirty pages cached to issue a "good"
+ * transfer.
+ *
+ * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+ * is completed---it is moved into cl_page_state::CPS_CACHED state.
+ *
+ * Underlying VM page is locked for the duration of transfer.
+ *
+ * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+ */
+ CPS_PAGEOUT,
+ /**
+ * Page is being read in, as a part of a transfer. This is quite
+ * similar to the cl_page_state::CPS_PAGEOUT state, except that
+	 * read-in is always "immediate"---there is no such thing as a sudden
+ * construction of read cl_req from cached, presumably not up to date,
+ * pages.
+ *
+ * Underlying VM page is locked for the duration of transfer.
+ *
+ * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+ */
+ CPS_PAGEIN,
+ /**
+ * Page is being destroyed. This state is entered when client decides
+ * that page has to be deleted from its host object, as, e.g., a part
+ * of truncate.
+ *
+ * Once this state is reached, there is no way to escape it.
+ *
+ * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+ */
+ CPS_FREEING,
+ CPS_NR
+};
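+
+/*
+ * Illustrative sketch, not part of the original import: the transitions
+ * documented above, written out for a simple cached read of one page.
+ */
+static const enum cl_page_state cl_page_read_path_sketch[] __maybe_unused = {
+	CPS_CACHED,	/* page sits in the cache, un-owned */
+	CPS_OWNED,	/* an io finds the page and takes ownership */
+	CPS_PAGEIN,	/* read transfer is started against the page */
+	CPS_CACHED,	/* transfer completion returns it to the cache */
+};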
+
+enum cl_page_type {
+	/** Host page: the page comes from the host inode to which the
+	 * cl_page belongs. */
+ CPT_CACHEABLE = 1,
+
+	/** Transient page: used to bind a cl_page to a vmpage that does not
+	 * belong to the same object as the cl_page. It is used for DirectIO,
+	 * lockless IO and liblustre. */
+ CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page.
+ */
+enum cl_page_flags {
+ /**
+ * Set when pagein completes. Used for debugging (read completes at
+ * most once for a page).
+ */
+ CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+ /** Reference counter. */
+ atomic_t cp_ref;
+ /** An object this page is a part of. Immutable after creation. */
+ struct cl_object *cp_obj;
+ /** Logical page index within the object. Immutable after creation. */
+ pgoff_t cp_index;
+ /** List of slices. Immutable after creation. */
+ struct list_head cp_layers;
+ /** Parent page, NULL for top-level page. Immutable after creation. */
+ struct cl_page *cp_parent;
+ /** Lower-layer page. NULL for bottommost page. Immutable after
+ * creation. */
+ struct cl_page *cp_child;
+ /**
+ * Page state. This field is const to avoid accidental update, it is
+ * modified only internally within cl_page.c. Protected by a VM lock.
+ */
+ const enum cl_page_state cp_state;
+ /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+ struct list_head cp_batch;
+ /** Mutex serializing membership of a page in a batch. */
+ struct mutex cp_mutex;
+ /** Linkage of pages within cl_req. */
+ struct list_head cp_flight;
+ /** Transfer error. */
+ int cp_error;
+
+ /**
+ * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+ * creation.
+ */
+ enum cl_page_type cp_type;
+
+ /**
+ * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+ * by sub-io. Protected by a VM lock.
+ */
+ struct cl_io *cp_owner;
+ /**
+	 * Debug information: the task that owns the page.
+ */
+ struct task_struct *cp_task;
+ /**
+ * Owning IO request in cl_page_state::CPS_PAGEOUT and
+ * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+ * the top-level pages. Protected by a VM lock.
+ */
+ struct cl_req *cp_req;
+ /** List of references to this page, for debugging. */
+ struct lu_ref cp_reference;
+ /** Link to an object, for debugging. */
+ struct lu_ref_link cp_obj_ref;
+ /** Link to a queue, for debugging. */
+ struct lu_ref_link cp_queue_ref;
+ /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+ unsigned cp_flags;
+ /** Assigned if doing a sync_io */
+ struct cl_sync_io *cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+ struct cl_page *cpl_page;
+ /**
+ * Object slice corresponding to this page slice. Immutable after
+ * creation.
+ */
+ struct cl_object *cpl_obj;
+ const struct cl_page_operations *cpl_ops;
+ /** Linkage into cl_page::cp_layers. Immutable after creation. */
+ struct list_head cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+ /**
+ * Mode of a lock that protects no data, and exists only as a
+ * placeholder. This is used for `glimpse' requests. A phantom lock
+ * might get promoted to real lock at some point.
+ */
+ CLM_PHANTOM,
+ CLM_READ,
+ CLM_WRITE,
+ CLM_GROUP
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+ CRT_READ,
+ CRT_WRITE,
+ CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+ /**
+ * cl_page<->struct page methods. Only one layer in the stack has to
+ * implement these. Current code assumes that this functionality is
+ * provided by the topmost layer, see cl_page_disown0() as an example.
+ */
+
+ /**
+ * \return the underlying VM page. Optional.
+ */
+ struct page *(*cpo_vmpage)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+ * Called when \a io acquires this page into the exclusive
+	 * ownership. When this method returns, it is guaranteed that the
+	 * page is not owned by any other io, and that no transfer is going
+	 * on against
+ * it. Optional.
+ *
+ * \see cl_page_own()
+ * \see vvp_page_own(), lov_page_own()
+ */
+ int (*cpo_own)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io, int nonblock);
+	/** Called when ownership is yielded. Optional.
+ *
+ * \see cl_page_disown()
+ * \see vvp_page_disown()
+ */
+ void (*cpo_disown)(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+ /**
+ * Called for a page that is already "owned" by \a io from VM point of
+ * view. Optional.
+ *
+ * \see cl_page_assume()
+ * \see vvp_page_assume(), lov_page_assume()
+ */
+ void (*cpo_assume)(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+ /** Dual to cl_page_operations::cpo_assume(). Optional. Called
+ * bottom-to-top when IO releases a page without actually unlocking
+ * it.
+ *
+ * \see cl_page_unassume()
+ * \see vvp_page_unassume()
+ */
+ void (*cpo_unassume)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /**
+ * Announces whether the page contains valid data or not by \a uptodate.
+ *
+ * \see cl_page_export()
+ * \see vvp_page_export()
+ */
+ void (*cpo_export)(const struct lu_env *env,
+ const struct cl_page_slice *slice, int uptodate);
+ /**
+ * Unmaps page from the user space (if it is mapped).
+ *
+ * \see cl_page_unmap()
+ * \see vvp_page_unmap()
+ */
+ int (*cpo_unmap)(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+ /**
+ * Checks whether underlying VM page is locked (in the suitable
+ * sense). Used for assertions.
+ *
+ * \retval -EBUSY: page is protected by a lock of a given mode;
+ * \retval -ENODATA: page is not protected by a lock;
+ * \retval 0: this layer cannot decide. (Should never happen.)
+ */
+ int (*cpo_is_vmlocked)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+ * Page destruction.
+ */
+
+ /**
+ * Called when page is truncated from the object. Optional.
+ *
+ * \see cl_page_discard()
+ * \see vvp_page_discard(), osc_page_discard()
+ */
+ void (*cpo_discard)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /**
+	 * Called when the page is removed from the cache and is about to be
+ * destroyed. Optional.
+ *
+ * \see cl_page_delete()
+ * \see vvp_page_delete(), osc_page_delete()
+ */
+ void (*cpo_delete)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /** Destructor. Frees resources and slice itself. */
+ void (*cpo_fini)(const struct lu_env *env,
+ struct cl_page_slice *slice);
+
+ /**
+ * Checks whether the page is protected by a cl_lock. This is a
+ * per-layer method, because certain layers have ways to check for the
+ * lock much more efficiently than through the generic locks scan, or
+ * implement locking mechanisms separate from cl_lock, e.g.,
+ * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
+ * being canceled, or scheduled for cancellation as soon as the last
+ * user goes away, too.
+ *
+ * \retval -EBUSY: page is protected by a lock of a given mode;
+ * \retval -ENODATA: page is not protected by a lock;
+ * \retval 0: this layer cannot decide.
+ *
+ * \see cl_page_is_under_lock()
+ */
+ int (*cpo_is_under_lock)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+
+ /**
+ * Optional debugging helper. Prints given page slice.
+ *
+ * \see cl_page_print()
+ */
+ int (*cpo_print)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t p);
+ /**
+ * \name transfer
+ *
+ * Transfer methods. See comment on cl_req for a description of
+ * transfer formation and life-cycle.
+ *
+ * @{
+ */
+ /**
+ * Request type dependent vector of operations.
+ *
+ * Transfer operations depend on transfer mode (cl_req_type). To avoid
+ * passing transfer mode to each and every of these methods, and to
+ * avoid branching on request type inside of the methods, separate
+ * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
+ * provided. That is, method invocation usually looks like
+ *
+ * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
+ */
+ struct {
+ /**
+ * Called when a page is submitted for a transfer as a part of
+ * cl_page_list.
+ *
+ * \return 0 : page is eligible for submission;
+ * \return -EALREADY : skip this page;
+ * \return -ve : error.
+ *
+ * \see cl_page_prep()
+ */
+ int (*cpo_prep)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /**
+ * Completion handler. This is guaranteed to be eventually
+ * fired after cl_page_operations::cpo_prep() or
+ * cl_page_operations::cpo_make_ready() call.
+ *
+ * This method can be called in a non-blocking context. It is
+ * guaranteed however, that the page involved and its object
+ * are pinned in memory (and, hence, calling cl_page_put() is
+ * safe).
+ *
+ * \see cl_page_completion()
+ */
+ void (*cpo_completion)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret);
+ /**
+ * Called when cached page is about to be added to the
+ * cl_req as a part of req formation.
+ *
+ * \return 0 : proceed with this page;
+ * \return -EAGAIN : skip this page;
+ * \return -ve : error.
+ *
+ * \see cl_page_make_ready()
+ */
+ int (*cpo_make_ready)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+ * Announce that this page is to be written out
+ * opportunistically, that is, page is dirty, it is not
+ * necessary to start write-out transfer right now, but
+ * eventually page has to be written out.
+ *
+ * Main caller of this is the write path (see
+ * vvp_io_commit_write()), using this method to build a
+ * "transfer cache" from which large transfers are then
+ * constructed by the req-formation engine.
+ *
+ * \todo XXX it would make sense to add page-age tracking
+ * semantics here, and to oblige the req-formation engine to
+		 * send the page out before it becomes too old.
+ *
+ * \see cl_page_cache_add()
+ */
+ int (*cpo_cache_add)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ } io[CRT_NR];
+ /**
+	 * Tell the transfer engine that only the [from, to] part of a page
+	 * should be transmitted.
+ *
+ * This is used for immediate transfers.
+ *
+	 * \todo XXX this is not a very good interface. It would be much better
+ * if all transfer parameters were supplied as arguments to
+ * cl_io_operations::cio_submit() call, but it is not clear how to do
+ * this for page queues.
+ *
+ * \see cl_page_clip()
+ */
+ void (*cpo_clip)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int from, int to);
+ /**
+ * \pre the page was queued for transferring.
+	 * \post page is removed from the client's pending list, or -EBUSY
+	 * is returned if it is already being transferred.
+	 *
+	 * This is one of the few page operations that is:
+	 * 0. called from the top level;
+	 * 1. called without the vmpage locked;
+	 * 2. required, at every layer, to synchronize execution of its
+	 *    ->cpo_cancel() with completion handlers. Osc uses the client
+	 *    obd lock for this purpose. Since there is no vvp_page_cancel()
+	 *    and no lov_page_cancel(), cpo_cancel() is de facto protected by
+	 *    the client lock.
+ *
+ * \see osc_page_cancel().
+ */
+ int (*cpo_cancel)(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+ /**
+	 * Write out a page on behalf of the kernel. This is currently only
+	 * called by ll_writepage().
+ *
+ * \see cl_page_flush()
+ */
+ int (*cpo_flush)(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+ /** @} transfer */
+};
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ cl_page_print(env, &msgdata, lu_cdebug_printer, page); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
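+
+/*
+ * Example (illustrative only) of the two helpers above; the debug masks and
+ * messages are arbitrary:
+ *
+ *	CL_PAGE_HEADER(D_TRACE, env, page, "submitting\n");
+ *	CL_PAGE_DEBUG(D_ERROR, env, page, "unexpected page state\n");
+ */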
+
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+ if (page->cp_type == CPT_CACHEABLE)
+ ++refc;
+ LASSERT(atomic_read(&page->cp_ref) > 0);
+ return (atomic_read(&page->cp_ref) > refc);
+}
+#define cl_page_in_use(pg) __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
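+
+/*
+ * Example (illustrative only): the helpers above are intended for invariant
+ * and debugging checks, e.g.
+ *
+ *	if (cl_page_in_use(page))
+ *		CL_PAGE_DEBUG(D_ERROR, env, page, "page still in use\n");
+ */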
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ * struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We
+ * could sort it by starting lock offset, or use an altogether different data
+ * structure, such as a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ * - vvp_lock (vvp specific data), and
+ * - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ * - lovsub_lock, and
+ * - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing a stripe
+ * sub-object or the file with which the top-level cl_lock is associated), and
+ * is linked into that cl_object::coh_locks. In this respect cl_lock is similar
+ * to cl_object (which at the lov layer also fans out into multiple
+ * sub-objects), and
+ * is different from cl_page, that doesn't fan out (there is usually exactly
+ * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
+ * a "top-lock" and its lovsub-osc portion a "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of client IO re-write was to make IO path non-blocking, or at
+ * least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work. E.g., instead of waiting for a lock enqueue, the
+ * client should proceed doing IO on the next stripe, etc. Obviously this is a
+ * rather radical redesign, and it is not planned to be fully implemented at
+ * this time; instead we are putting some infrastructure in place that would
+ * make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where old locking code goes to sleep (waiting for
+ * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When
+ * enqueue reply comes, its completion handler signals that lock state-machine
+ * is ready to transit to the next state. There is some generic code in
+ * cl_lock.c that sleeps, waiting for these signals. As a result, for users of
+ * this cl_lock.c code, it looks like locking is done in normal blocking
+ * fashion, and at the same time it is possible to switch to the non-blocking
+ * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c
+ * functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ * - placing a "hold" on a lock guarantees that lock will not be moved
+ * into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ * can be only acquired on a lock that is not in
+ * cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ * cl_lock::cll_holds. Hold protects lock from cancellation and
+ * destruction. Requests to cancel and destroy a lock on hold will be
+ * recorded, but only honored when last hold on a lock is released;
+ *
+ * - placing a "user" on a lock guarantees that lock will not leave
+ * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ * states, once it enters this set. That is, if a user is added onto a
+ * lock in a state not from this set, it doesn't immediately enforce
+ * lock to move to this set, but once lock enters this set it will
+ * remain there until all users are removed. Lock users are counted in
+ * cl_lock::cll_users.
+ *
+ * User is used to assure that lock is not canceled or destroyed while
+ * it is being enqueued, or actively used by some IO.
+ *
+ * Currently, a user always comes with a hold (cl_lock_invariant()
+ * checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how lock state-machine operates. struct cl_lock contains a mutex
+ * cl_lock::cll_guard that protects struct fields.
+ *
+ * - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ * - for every state there are possible target states where lock can move
+ * into. They are tried in order. Attempts to move into next state are
+ * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try().
+ *
+ * - if the transition can be performed immediately, state is changed,
+ * and mutex is released.
+ *
+ * - if the transition requires blocking, _try() function returns
+ * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ * sleep, waiting for possibility of lock state change. It is woken
+ * up when some event occurs, that makes lock state change possible
+ * (e.g., the reception of the reply from the server), and repeats
+ * the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes, and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provide a number of call-backs that are invoked
+ * when events of interest occurs. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks are
+ * represented by "request sets" that are created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as following:
+ *
+ * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock()
+ * is called on each layer. Responsibility of this method is to add locks,
+ * needed by a given layer into cl_io.ci_lockset.
+ *
+ * - once locks for all layers were collected, they are sorted to avoid
+ * dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ * - when all locks are acquired, IO is performed;
+ *
+ * - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided without
+ * sacrificing correctness. This includes:
+ *
+ * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ * atomicity;
+ *
+ * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement ->clo_fits_into() method,
+ * that is called by lock matching code (cl_lock_lookup()), and that can be
+ * used to selectively disable matching of certain locks for certain IOs. For
+ * example, lov layer implements lov_lock_fits_into() that allows multi-stripe
+ * locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
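+/*
+ * To make the CONCURRENCY rules above concrete, here is a minimal sketch of
+ * the blocking wrapper pattern used by the generic cl_lock.c code. It assumes
+ * the cl_lock_mutex_get()/cl_lock_mutex_put() and cl_lock_state_wait()
+ * primitives declared later in this header; op_try() stands for one of the
+ * cl_{enqueue,unlock,wait}_try() helpers.
+ *
+ *	cl_lock_mutex_get(env, lock);
+ *	do {
+ *		result = op_try(env, lock);
+ *		if (result == CLO_WAIT) {
+ *			// sleeps on cl_lock::cll_wq, dropping and
+ *			// re-acquiring the mutex around the sleep
+ *			result = cl_lock_state_wait(env, lock);
+ *			if (result == 0)
+ *				continue;
+ *		}
+ *		break;
+ *	} while (1);
+ *	cl_lock_mutex_put(env, lock);
+ */
+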
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+ /** Object this lock is granted for. */
+ struct cl_object *cld_obj;
+ /** Index of the first page protected by this lock. */
+ pgoff_t cld_start;
+ /** Index of the last page (inclusive) protected by this lock. */
+ pgoff_t cld_end;
+ /** Group ID, for group lock */
+ __u64 cld_gid;
+ /** Lock mode. */
+ enum cl_lock_mode cld_mode;
+ /**
+ * flags to enqueue lock. A combination of bit-flags from
+ * enum cl_enq_flags.
+ */
+ __u32 cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]"
+#define PDESCR(descr) \
+ cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \
+ (descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
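+
+/*
+ * Example (illustrative only): DDESCR/PDESCR are meant to be spliced into
+ * debugging format strings, e.g.
+ *
+ *	CDEBUG(D_DLMTRACE, "matched " DDESCR "\n", PDESCR(&lock->cll_descr));
+ */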
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ * +------------------>NEW
+ * | |
+ * | | cl_enqueue_try()
+ * | |
+ * | cl_unuse_try() V
+ * | +--------------QUEUING (*)
+ * | | |
+ * | | | cl_enqueue_try()
+ * | | |
+ * | | cl_unuse_try() V
+ * sub-lock | +-------------ENQUEUED (*)
+ * canceled | | |
+ * | | | cl_wait_try()
+ * | | |
+ * | | (R)
+ * | | |
+ * | | V
+ * | | HELD<---------+
+ * | | | |
+ * | | | | cl_use_try()
+ * | | cl_unuse_try() | |
+ * | | | |
+ * | | V ---+
+ * | +------------>INTRANSIT (D) <--+
+ * | | |
+ * | cl_unuse_try() | | cached lock found
+ * | | | cl_use_try()
+ * | | |
+ * | V |
+ * +------------------CACHED---------+
+ * |
+ * (C)
+ * |
+ * V
+ * FREEING
+ *
+ * Legend:
+ *
+ * In states marked with (*) transition to the same state (i.e., a loop
+ * in the diagram) is possible.
+ *
+ * (R) is the point where Receive call-back is invoked: it allows layers
+ * to handle arrival of lock reply.
+ *
+ * (C) is the point where Cancellation call-back is invoked.
+ *
+ * (D) is the transit state which means the lock is changing.
+ *
+ * Transition to FREEING state is possible from any other state in the
+ * diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for an individual cl_lock object. A top-lock and its
+ * sub-locks can be in different states. Another way to say this is that we
+ * have nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine lock on a file F, that
+ * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send
+ * enqueue to S0, wait for its completion, then send enqueue for S1, wait for
+ * its completion and at last enqueue lock for S2, and wait for its
+ * completion. In that case, top-lock is in QUEUING state while S0, S1 are
+ * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
+ * that in this case, sub-locks move from state to state, and top-lock remains
+ * in the same state).
+ */
+enum cl_lock_state {
+ /**
+ * Lock that wasn't yet enqueued
+ */
+ CLS_NEW,
+ /**
+ * Enqueue is in progress, blocking for some intermediate interaction
+ * with the other side.
+ */
+ CLS_QUEUING,
+ /**
+ * Lock is fully enqueued, waiting for server to reply when it is
+ * granted.
+ */
+ CLS_ENQUEUED,
+ /**
+ * Lock granted, actively used by some IO.
+ */
+ CLS_HELD,
+ /**
+	 * This state is used to mark that the lock is being used, or unused.
+	 * We need this state because the lock may have several sub-locks,
+	 * so there is no atomic way to bring all sub-locks into the CLS_HELD
+	 * state on use, or all sub-locks into CLS_CACHED on unuse.
+	 * If a thread is referring to a lock, and it sees the lock is in this
+	 * state, it must wait for the lock.
+ * See state diagram for details.
+ */
+ CLS_INTRANSIT,
+ /**
+ * Lock granted, not used.
+ */
+ CLS_CACHED,
+ /**
+ * Lock is being destroyed.
+ */
+ CLS_FREEING,
+ CLS_NR
+};
+
+enum cl_lock_flags {
+ /**
+ * lock has been cancelled. This flag is never cleared once set (by
+ * cl_lock_cancel0()).
+ */
+ CLF_CANCELLED = 1 << 0,
+ /** cancellation is pending for this lock. */
+ CLF_CANCELPEND = 1 << 1,
+ /** destruction is pending for this lock. */
+ CLF_DOOMED = 1 << 2,
+ /** from enqueue RPC reply upcall. */
+ CLF_FROM_UPCALL= 1 << 3,
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated in a result of an operation on a certain lock (which lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ * - nested state-machines (top-lock state-machine composed of sub-lock
+ * state-machines), and
+ *
+ * - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on a sub-locks of this
+ * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
+ * of such operation, this change has to be propagated to all top-locks that
+ * share this sub-lock. Obviously, no natural lock ordering (e.g.,
+ * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
+ * to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+ /**
+	 * Lock that is mutexed when closure construction is started. When the
+	 * closure is in `wait' mode (cl_lock_closure::clc_wait), the mutex on
+	 * the origin is released before waiting.
+ */
+ struct cl_lock *clc_origin;
+ /**
+ * List of enclosed locks, so far. Locks are linked here through
+ * cl_lock::cll_inclosure.
+ */
+ struct list_head clc_list;
+ /**
+ * True iff closure is in a `wait' mode. This determines what
+ * cl_lock_enclosure() does when a lock L to be added to the closure
+ * is currently mutexed by some other thread.
+ *
+ * If cl_lock_closure::clc_wait is not set, then closure construction
+ * fails with CLO_REPEAT immediately.
+ *
+ * In wait mode, cl_lock_enclosure() waits until next attempt to build
+ * a closure might succeed. To this end it releases an origin mutex
+ * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+ * owned by the current thread, and then waits on L mutex (by grabbing
+ * it and immediately releasing), before returning CLO_REPEAT to the
+ * caller.
+ */
+ int clc_wait;
+ /** Number of locks in the closure. */
+ int clc_nr;
+};
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+ /** Reference counter. */
+ atomic_t cll_ref;
+ /** List of slices. Immutable after creation. */
+ struct list_head cll_layers;
+ /**
+ * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+ * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+ */
+ struct list_head cll_linkage;
+ /**
+ * Parameters of this lock. Protected by
+ * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+ * cl_lock::cll_guard. Modified only on lock creation and in
+ * cl_lock_modify().
+ */
+ struct cl_lock_descr cll_descr;
+ /** Protected by cl_lock::cll_guard. */
+ enum cl_lock_state cll_state;
+ /** signals state changes. */
+ wait_queue_head_t cll_wq;
+ /**
+ * Recursive lock, most fields in cl_lock{} are protected by this.
+ *
+ * Locking rules: this mutex is never held across network
+ * communication, except when lock is being canceled.
+ *
+ * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+ * on a top-lock. Other direction is implemented through a
+	 * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+ * by try-locking.
+ *
+ * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+ */
+ struct mutex cll_guard;
+ struct task_struct *cll_guarder;
+ int cll_depth;
+
+ /**
+ * the owner for INTRANSIT state
+ */
+ struct task_struct *cll_intransit_owner;
+ int cll_error;
+ /**
+ * Number of holds on a lock. A hold prevents a lock from being
+ * canceled and destroyed. Protected by cl_lock::cll_guard.
+ *
+ * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+ */
+ int cll_holds;
+ /**
+ * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+ * only. Lock user pins lock in CLS_HELD state. Protected by
+ * cl_lock::cll_guard.
+ *
+ * \see cl_wait(), cl_unuse().
+ */
+ int cll_users;
+ /**
+ * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+ * protected by cl_lock::cll_guard.
+ */
+ unsigned long cll_flags;
+ /**
+ * A linkage into a list of locks in a closure.
+ *
+ * \see cl_lock_closure
+ */
+ struct list_head cll_inclosure;
+ /**
+	 * Conflicting lock at queuing time.
+ */
+ struct cl_lock *cll_conflict;
+ /**
+ * A list of references to this lock, for debugging.
+ */
+ struct lu_ref cll_reference;
+ /**
+ * A list of holds on this lock, for debugging.
+ */
+ struct lu_ref cll_holders;
+ /**
+ * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+ */
+ struct lu_ref_link cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+ /* "dep_map" name is assumed by lockdep.h macros. */
+ struct lockdep_map dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+ struct cl_lock *cls_lock;
+ /** Object slice corresponding to this lock slice. Immutable after
+ * creation. */
+ struct cl_object *cls_obj;
+ const struct cl_lock_operations *cls_ops;
+ /** Linkage into cl_lock::cll_layers. Immutable after creation. */
+ struct list_head cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+ /** operation cannot be completed immediately. Wait for state change. */
+ CLO_WAIT = 1,
+ /** operation had to release lock mutex, restart. */
+ CLO_REPEAT = 2,
+ /** lower layer re-enqueued. */
+ CLO_REENQUEUED = 3,
+};
+
+/**
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+ /**
+ * \name statemachine
+ *
+ * State machine transitions. These 3 methods are called to transfer
+ * lock from one state to another, as described in the commentary
+ * above enum #cl_lock_state.
+ *
+	 * \retval 0	    this layer has nothing more to do before
+ * transition to the target state happens;
+ *
+ * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+ * mutex, repeat invocation of transition method
+ * across all layers;
+ *
+ * \retval CLO_WAIT this layer cannot move to the target state
+ * immediately, as it has to wait for certain event
+ * (e.g., the communication with the server). It
+ * is guaranteed, that when the state transfer
+ * becomes possible, cl_lock::cll_wq wait-queue
+ * is signaled. Caller can wait for this event by
+ * calling cl_lock_state_wait();
+ *
+ * \retval -ve failure, abort state transition, move the lock
+ * into cl_lock_state::CLS_FREEING state, and set
+ * cl_lock::cll_error.
+ *
+ * Once all layers voted to agree to transition (by returning 0), lock
+ * is moved into corresponding target state. All state transition
+ * methods are optional.
+ */
+ /** @{ */
+ /**
+ * Attempts to enqueue the lock. Called top-to-bottom.
+ *
+ * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+ * \see osc_lock_enqueue()
+ */
+ int (*clo_enqueue)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *io, __u32 enqflags);
+ /**
+ * Attempts to wait for enqueue result. Called top-to-bottom.
+ *
+ * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+ */
+ int (*clo_wait)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Attempts to unlock the lock. Called bottom-to-top. In addition to
+ * usual return values of lock state-machine methods, this can return
+ * -ESTALE to indicate that lock cannot be returned to the cache, and
+ * has to be re-initialized.
+ * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
+ */
+ int (*clo_unuse)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+	 * Notifies the layer that a cached lock has started being used.
+ *
+ * \pre lock->cll_state == CLS_CACHED
+ *
+ * \see lov_lock_use(), osc_lock_use()
+ */
+ int (*clo_use)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /** @} statemachine */
+ /**
+ * A method invoked when lock state is changed (as a result of state
+ * transition). This is used, for example, to track when the state of
+ * a sub-lock changes, to propagate this change to the corresponding
+	 * top-lock. Optional.
+ *
+ * \see lovsub_lock_state()
+ */
+ void (*clo_state)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state st);
+ /**
+	 * Returns true iff the given lock is suitable for the given io, the
+	 * idea being that there are certain "unsafe" locks, e.g., ones
+	 * acquired for O_APPEND writes, that we don't want to re-use for a
+	 * normal write, to avoid the danger of cascading evictions. Optional.
+	 * Runs under cl_object_header::coh_lock_guard.
+ *
+ * XXX this should take more information about lock needed by
+ * io. Probably lock description or something similar.
+ *
+ * \see lov_fits_into()
+ */
+ int (*clo_fits_into)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io);
+ /**
+ * \name ast
+	 * Asynchronous System Traps. All of them are optional; all are
+ * executed bottom-to-top.
+ */
+ /** @{ */
+
+ /**
+ * Cancellation callback. Cancel a lock voluntarily, or under
+ * the request of server.
+ */
+ void (*clo_cancel)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Lock weighting ast. Executed to estimate how precious this lock
+	 * is. The sum of results across all layers is used to determine
+	 * whether the lock is worth keeping in the cache, given present
+	 * memory usage.
+ *
+ * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+ */
+ unsigned long (*clo_weigh)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /** @} ast */
+
+ /**
+ * \see lovsub_lock_closure()
+ */
+ int (*clo_closure)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_lock_closure *closure);
+ /**
+ * Executed bottom-to-top when lock description changes (e.g., as a
+ * result of server granting more generous lock than was requested).
+ *
+ * \see lovsub_lock_modify()
+ */
+ int (*clo_modify)(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *updated);
+ /**
+ * Notifies layers (bottom-to-top) that lock is going to be
+ * destroyed. Responsibility of layers is to prevent new references on
+ * this lock from being acquired once this method returns.
+ *
+	 * This can be called multiple times due to races.
+ *
+ * \see cl_lock_delete()
+ * \see osc_lock_delete(), lovsub_lock_delete()
+ */
+ void (*clo_delete)(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ /**
+ * Destructor. Frees resources and the slice.
+ *
+ * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+ * \see osc_lock_fini()
+ */
+ void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+ /**
+ * Optional debugging helper. Prints given lock slice.
+ */
+ int (*clo_print)(const struct lu_env *env,
+ void *cookie, lu_printer_t p,
+ const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+#define CL_LOCK_ASSERT(expr, env, lock) do { \
+ if (likely(expr)) \
+ break; \
+ \
+ CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \
+ LBUG(); \
+} while (0)
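+
+/*
+ * Example (illustrative only) of the two helpers above; the mask and message
+ * are arbitrary:
+ *
+ *	CL_LOCK_ASSERT(lock->cll_holds > 0, env, lock);
+ *	CL_LOCK_DEBUG(D_DLMTRACE, env, lock, "state: %d\n", lock->cll_state);
+ */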
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ * - submit pages for an immediate transfer,
+ *
+ * - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ * - discard pages.
+ *
+ * When list is finalized, it releases references on all pages it still has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+ unsigned pl_nr;
+ struct list_head pl_pages;
+ struct task_struct *pl_owner;
+};
+
+/**
+ * A 2-queue of pages. A convenience data-type for common use case, 2-queue
+ * contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+ struct cl_page_list c2_qin;
+ struct cl_page_list c2_qout;
+};
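+
+/*
+ * Illustrative sketch, assuming the cl_page_list_init(), cl_page_list_add()
+ * and cl_page_list_fini() helpers declared later in this header: collecting
+ * pages before a collective operation.
+ *
+ *	struct cl_page_list plist;
+ *
+ *	cl_page_list_init(&plist);
+ *	cl_page_list_add(&plist, page);	// takes a reference on the page
+ *	// submit, own or discard the list here ...
+ *	cl_page_list_fini(env, &plist);	// drops remaining references
+ */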
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize number of calls to the allocator
+ * in the fast path, e.g., in the case of read(2) when everything is cached:
+ * client already owns the lock over region being read, and data are cached
+ * due to read-ahead. To avoid allocation of cl_io layers in such situations,
+ * per-layer io state is stored in the session, associated with the io, see
+ * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized
+ * by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine, that can be advanced concurrently by the multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
+ *
+ * For read/write io overall execution plan is as following:
+ *
+ * (0) initialize io state through all layers;
+ *
+ * (1) loop: prepare chunk of work to do
+ *
+ * (2) call all layers to collect locks they need to process current chunk
+ *
+ * (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ * (4) process the chunk: call per-page methods
+ * (cl_io_operations::cio_read_page() for read,
+ * cl_io_operations::cio_prepare_write(),
+ * cl_io_operations::cio_commit_write() for write)
+ *
+ * (5) release locks
+ *
+ * (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
+ * address allocation efficiency issues mentioned above), and returns with the
+ * special error condition from per-page method when current sub-io has to
+ * block. This causes io loop to be repeated, and lov switches to the next
+ * sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
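+
+/*
+ * A minimal sketch of the loop described in steps (0)-(6) above, assuming the
+ * cl_io_iter_init()/cl_io_iter_fini(), cl_io_lock()/cl_io_unlock(),
+ * cl_io_start()/cl_io_end() and cl_io_rw_advance() entry points declared
+ * later in this header (the generic cl_io_loop() follows this shape):
+ *
+ *	do {
+ *		io->ci_continue = 0;
+ *		result = cl_io_iter_init(env, io);
+ *		if (result == 0) {
+ *			size_t nob = io->ci_nob;
+ *
+ *			result = cl_io_lock(env, io);
+ *			if (result == 0) {
+ *				result = cl_io_start(env, io);
+ *				cl_io_end(env, io);
+ *				cl_io_unlock(env, io);
+ *				cl_io_rw_advance(env, io, io->ci_nob - nob);
+ *			}
+ *		}
+ *		cl_io_iter_fini(env, io);
+ *	} while (result == 0 && io->ci_continue);
+ */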
+
+/** IO types */
+enum cl_io_type {
+ /** read system call */
+ CIT_READ,
+ /** write system call */
+ CIT_WRITE,
+ /** truncate, utime system calls */
+ CIT_SETATTR,
+ /**
+ * page fault handling
+ */
+ CIT_FAULT,
+ /**
+ * fsync system call handling
+ * To write out a range of file
+ */
+ CIT_FSYNC,
+ /**
+ * Miscellaneous io. This is used for occasional io activity that
+ * doesn't fit into other types. Currently this is used for:
+ *
+ * - cancellation of an extent lock. This io exists as a context
+ * to write dirty pages from under the lock being canceled back
+ * to the server;
+ *
+ * - VM induced page write-out. An io context for writing page out
+ * for memory cleansing;
+ *
+ * - glimpse. An io context to acquire glimpse lock.
+ *
+ * - grouplock. An io context to acquire group lock.
+ *
+ * CIT_MISC io is used simply as a context in which locks and pages
+ * are manipulated. Such io has no internal "process", that is,
+ * cl_io_loop() is never called for it.
+ */
+ CIT_MISC,
+ CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine
+ */
+enum cl_io_state {
+ /** Not initialized. */
+ CIS_ZERO,
+ /** Initialized. */
+ CIS_INIT,
+ /** IO iteration started. */
+ CIS_IT_STARTED,
+ /** Locks taken. */
+ CIS_LOCKED,
+ /** Actual IO is in progress. */
+ CIS_IO_GOING,
+ /** IO for the current iteration finished. */
+ CIS_IO_FINISHED,
+ /** Locks released. */
+ CIS_UNLOCKED,
+ /** Iteration completed. */
+ CIS_IT_ENDED,
+ /** cl_io finalized. */
+ CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+ struct cl_io *cis_io;
+ /** corresponding object slice. Immutable after creation. */
+ struct cl_object *cis_obj;
+ /** io operations. Immutable after creation. */
+ const struct cl_io_operations *cis_iop;
+ /**
+ * linkage into a list of all slices for a given cl_io, hanging off
+ * cl_io::ci_layers. Immutable after creation.
+ */
+ struct list_head cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+ /**
+ * Vector of io state transition methods for every io type.
+ *
+ * \see cl_page_operations::io
+ */
+ struct {
+ /**
+ * Prepare io iteration at a given layer.
+ *
+ * Called top-to-bottom at the beginning of each iteration of
+ * "io loop" (if it makes sense for this type of io). Here
+ * layer selects what work it will do during this iteration.
+ *
+ * \see cl_io_operations::cio_iter_fini()
+ */
+ int (*cio_iter_init) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Finalize io iteration.
+ *
+ * Called bottom-to-top at the end of each iteration of "io
+ * loop". Here layers can decide whether IO has to be
+ * continued.
+ *
+ * \see cl_io_operations::cio_iter_init()
+ */
+ void (*cio_iter_fini) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Collect locks for the current iteration of io.
+ *
+ * Called top-to-bottom to collect all locks necessary for
+		 * this iteration. This method shouldn't actually enqueue
+ * anything, instead it should post a lock through
+ * cl_io_lock_add(). Once all locks are collected, they are
+ * sorted and enqueued in the proper order.
+ */
+ int (*cio_lock) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Finalize unlocking.
+ *
+ * Called bottom-to-top to finish layer specific unlocking
+ * functionality, after generic code released all locks
+ * acquired by cl_io_operations::cio_lock().
+ */
+ void (*cio_unlock)(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Start io iteration.
+ *
+ * Once all locks are acquired, called top-to-bottom to
+ * commence actual IO. In the current implementation,
+ * top-level vvp_io_{read,write}_start() does all the work
+ * synchronously by calling generic_file_*(), so other layers
+ * are called when everything is done.
+ */
+ int (*cio_start)(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Called top-to-bottom at the end of io loop. Here layer
+ * might wait for an unfinished asynchronous io.
+ */
+ void (*cio_end) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ /**
+ * Called bottom-to-top to notify layers that read/write IO
+ * iteration finished, with \a nob bytes transferred.
+ */
+ void (*cio_advance)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ size_t nob);
+ /**
+ * Called once per io, bottom-to-top to release io resources.
+ */
+ void (*cio_fini) (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ } op[CIT_OP_NR];
+ struct {
+ /**
+ * Submit pages from \a queue->c2_qin for IO, and move
+ * successfully submitted pages into \a queue->c2_qout. Return
+		 * non-zero if it failed to submit even a single page. If
+ * submission failed after some pages were moved into \a
+ * queue->c2_qout, completion callback with non-zero ioret is
+ * executed on them.
+ */
+ int (*cio_submit)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ enum cl_req_type crt,
+ struct cl_2queue *queue);
+ } req_op[CRT_NR];
+ /**
+ * Read missing page.
+ *
+ * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+	 * method, when it hits a not-up-to-date page in the range. Optional.
+ *
+ * \pre io->ci_type == CIT_READ
+ */
+ int (*cio_read_page)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ const struct cl_page_slice *page);
+ /**
+	 * Prepare write of a \a page. Called bottom-to-top by a top-level
+	 * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare the page to
+	 * receive data from a user-level buffer.
+ *
+ * \pre io->ci_type == CIT_WRITE
+ *
+ * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+ * osc_io_prepare_write().
+ */
+ int (*cio_prepare_write)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ const struct cl_page_slice *page,
+ unsigned from, unsigned to);
+ /**
+ *
+ * \pre io->ci_type == CIT_WRITE
+ *
+ * \see vvp_io_commit_write(), lov_io_commit_write(),
+ * osc_io_commit_write().
+ */
+ int (*cio_commit_write)(const struct lu_env *env,
+ const struct cl_io_slice *slice,
+ const struct cl_page_slice *page,
+ unsigned from, unsigned to);
+ /**
+ * Optional debugging helper. Print given io slice.
+ */
+ int (*cio_print)(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+ /**
+	 * instruct the server not to block if a conflicting lock is found.
+	 * Instead, -EWOULDBLOCK is returned immediately.
+ */
+ CEF_NONBLOCK = 0x00000001,
+ /**
+ * take lock asynchronously (out of order), as it cannot
+ * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+ */
+ CEF_ASYNC = 0x00000002,
+ /**
+ * tell the server to instruct (though a flag in the blocking ast) an
+ * owner of the conflicting lock, that it can drop dirty pages
+ * protected by this lock, without sending them to the server.
+ */
+ CEF_DISCARD_DATA = 0x00000004,
+ /**
+ * tell the sub layers that it must be a `real' lock. This is used for
+	 * mmapped-buffer locks and glimpse locks that must never be converted
+ * into lockless mode.
+ *
+ * \see vvp_mmap_locks(), cl_glimpse_lock().
+ */
+ CEF_MUST = 0x00000008,
+ /**
+	 * tell the sub layers to never request a `real' lock. This flag is
+	 * currently unused.
+ *
+ * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+ * conversion policy: ci_lockreq describes generic information of lock
+ * requirement for this IO, especially for locks which belong to the
+ * object doing IO; however, lock itself may have precise requirements
+ * that are described by the enqueue flags.
+ */
+ CEF_NEVER = 0x00000010,
+ /**
+ * for async glimpse lock.
+ */
+ CEF_AGL = 0x00000020,
+ /**
+ * mask of enq_flags.
+ */
+ CEF_MASK = 0x0000003f,
+};
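+
+/*
+ * Example (illustrative only): enqueue flags are combined in
+ * cl_lock_descr::cld_enq_flags, e.g., for a glimpse-style enqueue:
+ *
+ *	descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
+ */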
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+ /** linkage into one of cl_lockset lists. */
+ struct list_head cill_linkage;
+ struct cl_lock_descr cill_descr;
+ struct cl_lock *cill_lock;
+ /** optional destructor */
+ void (*cill_fini)(const struct lu_env *env,
+ struct cl_io_lock_link *link);
+};
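+
+/*
+ * Illustrative sketch, assuming the cl_io_lock_add() helper declared later in
+ * this header: how a layer's ->cio_lock() method typically fills a link and
+ * posts its lock requirement for the current iteration (the extent bounds are
+ * placeholders):
+ *
+ *	link->cill_descr.cld_obj   = io->ci_obj;
+ *	link->cill_descr.cld_mode  = CLM_WRITE;
+ *	link->cill_descr.cld_start = start_index;
+ *	link->cill_descr.cld_end   = end_index;
+ *	result = cl_io_lock_add(env, io, link);
+ */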
+
+/**
+ * Lock-set represents a collection of locks, that io needs at a
+ * time. Generally speaking, client tries to avoid holding multiple locks when
+ * possible, because
+ *
+ * - holding extent locks over multiple ost's introduces the danger of
+ * "cascading timeouts";
+ *
+ * - holding multiple locks over the same ost is still dead-lock prone,
+ * see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ * - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ * - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ * - SNS has to take locks across full stripe for correctness;
+ *
+ *   - in the case when a user level buffer, supplied to {read,write}(file0),
+ *     is a part of a memory mapped lustre file, the client has to take dlm
+ *     locks on file0 and on all files that back up the buffer (or on the
+ *     part of the buffer that is being processed in the current chunk); in
+ *     any case, there are situations where at least 2 locks are necessary.
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ */
+struct cl_lockset {
+ /** locks to be acquired. */
+ struct list_head cls_todo;
+ /** locks currently being processed. */
+ struct list_head cls_curr;
+ /** locks acquired. */
+ struct list_head cls_done;
+};
+
+/**
+ * Lock requirements (demand) for IO. It should be cl_io_lock_req,
+ * but 'req' is always to be thought of as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+ /** Always lock data (e.g., O_APPEND). */
+ CILR_MANDATORY = 0,
+ /** Layers are free to decide between local and global locking. */
+ CILR_MAYBE,
+ /** Never lock: there is no cache (e.g., liblustre). */
+ CILR_NEVER
+};
+
+enum cl_fsync_mode {
+ /** start writeback, do not wait for them to finish */
+ CL_FSYNC_NONE = 0,
+ /** start writeback and wait for them to finish */
+ CL_FSYNC_LOCAL = 1,
+ /** discard all of dirty pages in a specific file range */
+ CL_FSYNC_DISCARD = 2,
+	/** start writeback and make sure the pages have reached storage
+	 * before returning. An OST_SYNC RPC must be issued and finished */
+ CL_FSYNC_ALL = 3
+};
+
+struct cl_io_rw_common {
+ loff_t crw_pos;
+ size_t crw_count;
+ int crw_nonblock;
+};
+
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in current
+ * implementation only one thread advances IO, but parallel IO design and
+ * concurrent copy_*_user() require multiple threads acting on the same IO). It
+ * is up to these threads to serialize their activities, including updates to
+ * mutable cl_io fields.
+ */
+struct cl_io {
+ /** type of this IO. Immutable after creation. */
+ enum cl_io_type ci_type;
+ /** current state of cl_io state machine. */
+ enum cl_io_state ci_state;
+ /** main object this io is against. Immutable after creation. */
+ struct cl_object *ci_obj;
+ /**
+ * Upper layer io, of which this io is a part of. Immutable after
+ * creation.
+ */
+ struct cl_io *ci_parent;
+ /** List of slices. Immutable after creation. */
+ struct list_head ci_layers;
+ /** list of locks (to be) acquired by this io. */
+ struct cl_lockset ci_lockset;
+ /** lock requirements, this is just a help info for sublayers. */
+ enum cl_io_lock_dmd ci_lockreq;
+ union {
+ struct cl_rd_io {
+ struct cl_io_rw_common rd;
+ } ci_rd;
+ struct cl_wr_io {
+ struct cl_io_rw_common wr;
+ int wr_append;
+ int wr_sync;
+ } ci_wr;
+ struct cl_io_rw_common ci_rw;
+ struct cl_setattr_io {
+ struct ost_lvb sa_attr;
+ unsigned int sa_valid;
+ struct obd_capa *sa_capa;
+ } ci_setattr;
+ struct cl_fault_io {
+ /** page index within file. */
+ pgoff_t ft_index;
+			/** number of valid bytes on the faulted page. */
+ int ft_nob;
+ /** writable page? for nopage() only */
+ int ft_writable;
+ /** page of an executable? */
+ int ft_executable;
+ /** page_mkwrite() */
+ int ft_mkwrite;
+ /** resulting page */
+ struct cl_page *ft_page;
+ } ci_fault;
+ struct cl_fsync_io {
+ loff_t fi_start;
+ loff_t fi_end;
+ struct obd_capa *fi_capa;
+ /** file system level fid */
+ struct lu_fid *fi_fid;
+ enum cl_fsync_mode fi_mode;
+ /* how many pages were written/discarded */
+ unsigned int fi_nr_written;
+ } ci_fsync;
+ } u;
+ struct cl_2queue ci_queue;
+ size_t ci_nob;
+ int ci_result;
+ unsigned int ci_continue:1,
+ /**
+	 * This io holds a grouplock, to inform sublayers not to do
+	 * lockless i/o.
+ */
+ ci_no_srvlock:1,
+ /**
+	 * The whole IO needs to be restarted because the layout has changed
+ */
+ ci_need_restart:1,
+ /**
+	 * to not refresh layout - the IO issuer knows that the layout won't
+	 * change (page operations; a layout change causes all pages to be
+	 * discarded), or it doesn't matter if it changes (sync).
+ */
+ ci_ignore_layout:1,
+ /**
+	 * Check if the layout changed after the IO finishes. Mainly for HSM
+	 * requirements. If IO occurs on open files, it doesn't need to
+	 * verify the layout because HSM won't release open files.
+ * Right now, only two operations need to verify layout: glimpse
+ * and setattr.
+ */
+ ci_verify_layout:1,
+ /**
+	 * file is released; restore has to be triggered by the vvp layer
+ */
+ ci_restore_needed:1,
+ /**
+ * O_NOATIME
+ */
+ ci_noatime:1;
+ /**
+ * Number of pages owned by this IO. For invariant checking.
+ */
+ unsigned ci_owned_nr;
+};
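+
+/*
+ * Illustrative sketch of driving the state machine above for an fsync-type
+ * request, assuming the cl_io_init(), cl_io_loop() and cl_io_fini() entry
+ * points declared later in this header; how the io itself is obtained (e.g.,
+ * from the per-thread session) is outside the scope of this example.
+ *
+ *	io->ci_obj = obj;
+ *	io->u.ci_fsync.fi_start = start;
+ *	io->u.ci_fsync.fi_end   = end;
+ *	io->u.ci_fsync.fi_mode  = CL_FSYNC_LOCAL;
+ *	if (cl_io_init(env, io, CIT_FSYNC, obj) == 0)
+ *		result = cl_io_loop(env, io);
+ *	cl_io_fini(env, io);
+ */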
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ * - immediate transfer: this is started when a high level io wants a page
+ * or a collection of pages to be transferred right away. Examples:
+ * read-ahead, synchronous read in the case of non-page aligned write,
+ * page write-out as a part of extent lock cancellation, page write-out
+ * as a part of memory cleansing. Immediate transfer can be both
+ * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ * when io wants to transfer a page to the server some time later, when
+ * it can be done efficiently. Example: pages dirtied by the write(2)
+ * path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, size of the dirty cache, etc.
+ *
+ * For the immediate transfer io submits a cl_page_list, that req-formation
+ * engine slices into cl_req's, possibly adding cached pages to some of
+ * the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. cl_page_operations::cpo_prep() method at the particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if page, submitted for read,
+ * became up-to-date in the meantime; and for write, if the page does not
+ * have its dirty bit set. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * Plan is to divide transfers into "priority bands" (indicated when
+ * submitting cl_page_list, and queuing a page for the opportunistic transfer)
+ * and allow glueing of cached pages to immediate transfers only within single
+ * band. This would make high priority transfers (like lock cancellation or
+ * memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+ /** Generic attributes for the server consumption. */
+ struct obdo *cra_oa;
+ /** Capability. */
+ struct obd_capa *cra_capa;
+ /** Jobid */
+ char cra_jobid[JOBSTATS_JOBID_SIZE];
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+ /**
+ * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+ * complete (all pages are added).
+ *
+ * \see osc_req_prep()
+ */
+ int (*cro_prep)(const struct lu_env *env,
+ const struct cl_req_slice *slice);
+ /**
+ * Called top-to-bottom to fill in \a oa fields. This is called twice
+ * with different flags, see bug 10150 and osc_build_req().
+ *
+	 * \param obj an object from cl_req whose attributes are to be set in
+ * \a oa.
+ *
+ * \param oa struct obdo where attributes are placed
+ *
+ * \param flags \a oa fields to be filled.
+ */
+ void (*cro_attr_set)(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *attr, u64 flags);
+ /**
+ * Called top-to-bottom from cl_req_completion() to notify layers that
+ * transfer completed. Has to free all state allocated by
+ * cl_device_operations::cdo_req_init().
+ */
+ void (*cro_completion)(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * A per-object state that (potentially multi-object) transfer request keeps.
+ */
+struct cl_req_obj {
+ /** object itself */
+ struct cl_object *ro_obj;
+ /** reference to cl_req_obj::ro_obj. For debugging. */
+ struct lu_ref_link ro_obj_ref;
+ /* something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because IO sub-system owns
+ * them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()), req keeps track of all objects it
+ * contains pages for.
+ *
+ * Once all pages were collected, cl_page_operations::cpo_prep() method is
+ * called top-to-bottom. At that point layers can modify req, let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+ enum cl_req_type crq_type;
+ /** A list of pages being transferred */
+ struct list_head crq_pages;
+ /** Number of pages in cl_req::crq_pages */
+ unsigned crq_nrpages;
+ /** An array of objects which pages are in ->crq_pages */
+ struct cl_req_obj *crq_o;
+	/** Number of elements in cl_req::crq_o[] */
+ unsigned crq_nrobjs;
+ struct list_head crq_layers;
+};
+
+/**
+ * Per-layer state for request.
+ */
+struct cl_req_slice {
+ struct cl_req *crs_req;
+ struct cl_device *crs_dev;
+ struct list_head crs_linkage;
+ const struct cl_req_operations *crs_ops;
+};
+
+/* @} cl_req */
+
+enum cache_stats_item {
+ /** how many cache lookups were performed */
+ CS_lookup = 0,
+ /** how many times cache lookup resulted in a hit */
+ CS_hit,
+ /** how many entities are in the cache right now */
+ CS_total,
+ /** how many entities in the cache are actively used (and cannot be
+ * evicted) right now */
+ CS_busy,
+ /** how many entities were created at all */
+ CS_create,
+ CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */
+struct cache_stats {
+ const char *cs_name;
+ atomic_t cs_stats[CS_NR];
+};
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
+
+/**
+ * Client-side site. This represents particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in the single address space.
+ */
+struct cl_site {
+ struct lu_site cs_lu;
+ /**
+ * Statistical counters. Atomics do not scale, something better like
+ * per-cpu counters is needed.
+ *
+ * These are exported as /proc/fs/lustre/llite/.../site
+ *
+	 * When interpreting, keep in mind that both sub-locks (and sub-pages)
+ * and top-locks (and top-pages) are accounted here.
+ */
+ struct cache_stats cs_pages;
+ struct cache_stats cs_locks;
+ atomic_t cs_pages_state[CPS_NR];
+ atomic_t cs_locks_state[CLS_NR];
+};
+
+int cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessory functions.
+ */
+/** @{ */
+
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+ return container_of(site, struct cl_site, cs_lu);
+}
+
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+ return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+ LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+ return container_of0(d, struct cl_device, cd_lu_dev);
+}
+
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+ return &d->cd_lu_dev;
+}
+
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+ LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+ return container_of0(o, struct cl_object, co_lu);
+}
+
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+ return container_of0(conf, struct cl_object_conf, coc_lu);
+}
+
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+ return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
+}
+
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+ LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
+ return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+ return container_of0(h, struct cl_object_header, coh_lu);
+}
+
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+ return lu2cl_site(obj->co_lu.lo_dev->ld_site);
+}
+
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+ return luh2coh(obj->co_lu.lo_header);
+}
+
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+ return lu_device_init(&d->cd_lu_dev, t);
+}
+
+static inline void cl_device_fini(struct cl_device *d)
+{
+ lu_device_fini(&d->cd_lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+ struct cl_object *obj,
+ const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+ struct cl_object *obj,
+ const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+ struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+ struct cl_device *dev,
+ const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+ const struct lu_fid *fid,
+ const struct cl_object_conf *c);
+
+int cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put (const struct lu_env *env, struct cl_object *o);
+void cl_object_get (struct cl_object *o);
+void cl_object_attr_lock (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj,
+ struct ost_lvb *lvb);
+int cl_conf_set (const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf);
+void cl_object_prune (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill (const struct lu_env *env, struct cl_object *obj);
+int cl_object_has_locks (struct cl_object *obj);
+
+/**
+ * Returns true, iff \a o0 and \a o1 are slices of the same object.
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+ return cl_object_header(o0) == cl_object_header(o1);
+}
+
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+ clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
+ cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
+}
+
+static inline void *cl_object_page_slice(struct cl_object *clob,
+ struct cl_page *page)
+{
+ return (void *)((char *)page + clob->co_slice_off);
+}
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+enum {
+ CLP_GANG_OKAY = 0,
+ CLP_GANG_RESCHED,
+ CLP_GANG_AGAIN,
+ CLP_GANG_ABORT
+};
+
+/* callback of cl_page_gang_lookup() */
+typedef int (*cl_page_gang_cb_t) (const struct lu_env *, struct cl_io *,
+ struct cl_page *, void *);
+int cl_page_gang_lookup (const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_io *io,
+ pgoff_t start, pgoff_t end,
+ cl_page_gang_cb_t cb, void *cbdata);
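+
+/*
+ * Illustrative sketch (not part of the original interface documentation): a
+ * minimal cl_page_gang_cb_t callback and how it could be passed to
+ * cl_page_gang_lookup().  The callback name, the "nr" cookie and the
+ * [start, end] index range are assumptions made for this example only.
+ *
+ *	static int count_page_cb(const struct lu_env *env, struct cl_io *io,
+ *				 struct cl_page *page, void *cbdata)
+ *	{
+ *		(*(unsigned *)cbdata)++;	// count the page
+ *		return CLP_GANG_OKAY;		// keep scanning the range
+ *	}
+ *
+ *	unsigned nr = 0;
+ *	int rc = cl_page_gang_lookup(env, obj, io, start, end,
+ *				     count_page_cb, &nr);
+ */
+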
+struct cl_page *cl_page_lookup (struct cl_object_header *hdr,
+ pgoff_t index);
+struct cl_page *cl_page_find (const struct lu_env *env,
+ struct cl_object *obj,
+ pgoff_t idx, struct page *vmpage,
+ enum cl_page_type type);
+struct cl_page *cl_page_find_sub (const struct lu_env *env,
+ struct cl_object *obj,
+ pgoff_t idx, struct page *vmpage,
+ struct cl_page *parent);
+void cl_page_get (struct cl_page *page);
+void cl_page_put (const struct lu_env *env,
+ struct cl_page *page);
+void cl_page_print (const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_page *pg);
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_page *pg);
+struct page *cl_page_vmpage (const struct lu_env *env,
+ struct cl_page *page);
+struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top (struct cl_page *page);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+ const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int cl_page_own (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+int cl_page_own_try (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+void cl_page_assume (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+void cl_page_unassume (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg);
+void cl_page_disown (const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page);
+int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int cl_page_prep (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+ struct cl_page *pg, enum cl_req_type crt, int ioret);
+int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+ enum cl_req_type crt);
+int cl_page_cache_add (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip (const struct lu_env *env, struct cl_page *pg,
+ int from, int to);
+int cl_page_cancel (const struct lu_env *env, struct cl_page *page);
+int cl_page_flush (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg);
+
+/** @} transfer */
+
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void cl_page_discard (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg);
+void cl_page_delete (const struct lu_env *env, struct cl_page *pg);
+int cl_page_unmap (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg);
+int cl_page_is_vmlocked (const struct lu_env *env,
+ const struct cl_page *pg);
+void cl_page_export (const struct lu_env *env,
+ struct cl_page *pg, int uptodate);
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page);
+loff_t cl_offset (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index (const struct cl_object *obj, loff_t offset);
+int cl_page_size (const struct cl_object *obj);
+int cl_pages_prune (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print (const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_lock_descr *descr);
+/* @} helper */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+ struct cl_object *obj, pgoff_t index,
+ struct cl_lock *except, int pending,
+ int canceld);
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_page *page,
+ struct cl_lock *except,
+ int pending, int canceld)
+{
+ LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+ return cl_lock_at_pgoff(env, obj, page->cp_index, except,
+ pending, canceld);
+}
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+ const struct lu_device_type *dtype);
+
+void cl_lock_get (struct cl_lock *lock);
+void cl_lock_get_trust (struct cl_lock *lock);
+void cl_lock_put (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_release (const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source);
+void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock);
+
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+ struct cl_lock *lock);
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state);
+int cl_lock_is_intransit(struct cl_lock *lock);
+
+int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
+ int keep_mutex);
+
+/** \name statemachine
+ * The interface to the lock state machine consists of 3 parts:
+ *
+ * - "try" functions that attempt to effect a state transition. If state
+ * transition is not possible right now (e.g., if it has to wait for some
+ * asynchronous event to occur), these functions return
+ * cl_lock_transition::CLO_WAIT.
+ *
+ * - "non-try" functions that implement synchronous blocking interface on
+ * top of non-blocking "try" functions. These functions repeatedly call
+ * corresponding "try" versions, and if state transition is not possible
+ * immediately, wait for lock state change.
+ *
+ * - methods from cl_lock_operations, called by "try" functions. A lock can
+ * be advanced to the target state only when all layers have voted that
+ * they are ready for this transition. "Try" functions call methods under
+ * the lock mutex. If a layer had to release the mutex, it re-acquires it
+ * and returns cl_lock_transition::CLO_REPEAT, causing the "try" function
+ * to call all layers again.
+ *
+ * TRY NON-TRY METHOD FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD
+ *
+ * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED
+ *
+ * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD
+ *
+ * @{ */
+
+int cl_enqueue (const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags);
+int cl_wait (const struct lu_env *env, struct cl_lock *lock);
+void cl_unuse (const struct lu_env *env, struct cl_lock *lock);
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags);
+int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock);
+int cl_wait_try (const struct lu_env *env, struct cl_lock *lock);
+int cl_use_try (const struct lu_env *env, struct cl_lock *lock, int atomic);
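+
+/*
+ * Illustrative sketch (one plausible sequence, not a contract): the blocking
+ * "non-try" calls used back to back, mirroring the table above.  "env", "io"
+ * and "descr" are assumed to be prepared by the caller; error handling and
+ * lock-user accounting are omitted.
+ *
+ *	struct cl_lock *lock;
+ *	int rc;
+ *
+ *	lock = cl_lock_hold(env, io, descr, "example", io);
+ *	if (IS_ERR(lock))
+ *		return PTR_ERR(lock);
+ *	rc = cl_enqueue(env, lock, io, 0);	// -> CLS_ENQUEUED
+ *	if (rc == 0)
+ *		rc = cl_wait(env, lock);	// -> CLS_HELD
+ *	// ... the lock protects the extent described by descr ...
+ *	if (rc == 0)
+ *		cl_unuse(env, lock);		// -> CLS_CACHED
+ *	cl_lock_release(env, lock, "example", io);
+ */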
+
+/** @} statemachine */
+
+void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state);
+int cl_queue_match (const struct list_head *queue,
+ const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_is_mutexed (struct cl_lock *lock);
+int cl_lock_nr_mutexed (const struct lu_env *env);
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
+int cl_lock_ext_match (const struct cl_lock_descr *has,
+ const struct cl_lock_descr *need);
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+ const struct cl_lock_descr *need);
+int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock,
+ const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+ struct cl_lock_closure *closure,
+ struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure);
+void cl_lock_disclosure (const struct lu_env *env,
+ struct cl_lock_closure *closure);
+int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int cl_io_init (const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj);
+int cl_io_sub_init (const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj);
+int cl_io_rw_init (const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, loff_t pos, size_t count);
+int cl_io_loop (const struct lu_env *env, struct cl_io *io);
+
+void cl_io_fini (const struct lu_env *env, struct cl_io *io);
+int cl_io_iter_init (const struct lu_env *env, struct cl_io *io);
+void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io);
+int cl_io_lock (const struct lu_env *env, struct cl_io *io);
+void cl_io_unlock (const struct lu_env *env, struct cl_io *io);
+int cl_io_start (const struct lu_env *env, struct cl_io *io);
+void cl_io_end (const struct lu_env *env, struct cl_io *io);
+int cl_io_lock_add (const struct lu_env *env, struct cl_io *io,
+ struct cl_io_lock_link *link);
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+ struct cl_lock_descr *descr);
+int cl_io_read_page (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page);
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, unsigned from, unsigned to);
+int cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, unsigned from, unsigned to);
+int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type iot, struct cl_2queue *queue);
+int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type iot, struct cl_2queue *queue,
+ long timeout);
+void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io,
+ size_t nob);
+int cl_io_cancel (const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue);
+int cl_io_is_going (const struct lu_env *env);
+
+/**
+ * True, iff \a io is an O_APPEND write(2).
+ */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+ return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
+}
+
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+ return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
+}
+
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+ return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
+}
+
+/**
+ * True, iff \a io is a truncate(2).
+ */
+static inline int cl_io_is_trunc(const struct cl_io *io)
+{
+ return io->ci_type == CIT_SETATTR &&
+ (io->u.ci_setattr.sa_valid & ATTR_SIZE);
+}
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_io *io);
+
+#define CL_IO_SLICE_CLEAN(foo_io, base) \
+do { \
+ typeof(foo_io) __foo_io = (foo_io); \
+ \
+ CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \
+ memset(&__foo_io->base + 1, 0, \
+ sizeof(*__foo_io) - sizeof(__foo_io->base)); \
+} while (0)
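+
+/*
+ * Illustrative sketch (made-up layer structure): a per-layer io slice embeds
+ * its generic part as the *first* member so that CL_IO_SLICE_CLEAN() can
+ * zero everything that follows it between re-uses.
+ *
+ *	struct foo_io {
+ *		struct cl_io_slice fi_cl;	// generic part, must be first
+ *		int		   fi_count;	// layer-private state
+ *	};
+ *
+ *	static void foo_io_reset(struct foo_io *fio)
+ *	{
+ *		CL_IO_SLICE_CLEAN(fio, fi_cl);	// zeroes fi_count, keeps fi_cl
+ *	}
+ */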
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/**
+ * Last page in the page list.
+ */
+static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
+{
+ LASSERT(plist->pl_nr > 0);
+ return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
+}
+
+/**
+ * Iterate over pages in a page list.
+ */
+#define cl_page_list_for_each(page, list) \
+ list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Iterate over pages in a page list, taking possible removals into account.
+ */
+#define cl_page_list_for_each_safe(page, temp, list) \
+ list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
+
+void cl_page_list_init (struct cl_page_list *plist);
+void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src,
+ struct cl_page *page);
+void cl_page_list_splice (struct cl_page_list *list,
+ struct cl_page_list *head);
+void cl_page_list_del (const struct lu_env *env,
+ struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+int cl_page_list_own (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+int cl_page_list_unmap (const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init (struct cl_2queue *queue);
+void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown (const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume (const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard (const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+ enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add (const struct lu_env *env, struct cl_req *req,
+ struct cl_page *page);
+void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
+int cl_req_prep (const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set (const struct lu_env *env, struct cl_req *req,
+ struct cl_req_attr *attr, u64 flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
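+
+/*
+ * Illustrative sketch (not taken from a real transfer engine): the typical
+ * life cycle of a cl_req as driven by a req-formation engine.  "env",
+ * "first_page", "plist" (a cl_page_list) and "nr_objects" are assumed to be
+ * provided by the caller.
+ *
+ *	struct cl_req *req;
+ *	struct cl_page *page;
+ *	int rc;
+ *
+ *	req = cl_req_alloc(env, first_page, CRT_WRITE, nr_objects);
+ *	if (IS_ERR(req))
+ *		return PTR_ERR(req);
+ *	cl_page_list_for_each(page, plist)
+ *		cl_req_page_add(env, req, page);	// attach pages
+ *	rc = cl_req_prep(env, req);	// layers may veto or adjust the req
+ *	// ... initiate the transfer; once it finishes (or fails):
+ *	cl_req_completion(env, req, rc);
+ */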
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for a synchronous transfer. This is allocated on the stack by the
+ * thread doing the synchronous transfer, and a pointer to this structure is
+ * set up in every page submitted for transfer. The transfer completion
+ * routine updates the anchor and wakes up the waiting thread when the
+ * transfer is complete.
+ */
+struct cl_sync_io {
+ /** number of pages yet to be transferred. */
+ atomic_t csi_sync_nr;
+ /** error code. */
+ int csi_sync_rc;
+ /** barrier preventing this structure from being destroyed prematurely */
+ atomic_t csi_barrier;
+ /** completion to be signaled when transfer is complete. */
+ wait_queue_head_t csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct cl_sync_io *anchor,
+ long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
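+
+/*
+ * Illustrative sketch: how a caller might drive a synchronous transfer with
+ * a stack anchor.  cl_io_submit_sync() above is the ready-made wrapper for
+ * this pattern; "env", "io", "queue" (a cl_2queue) and "timeout" are assumed
+ * to be prepared by the caller.
+ *
+ *	struct cl_sync_io anchor;
+ *	int rc;
+ *
+ *	cl_sync_io_init(&anchor, queue->c2_qin.pl_nr);
+ *	// the submission path points every submitted page at &anchor
+ *	rc = cl_io_submit_rw(env, io, CRT_WRITE, queue);
+ *	if (rc == 0)
+ *		rc = cl_sync_io_wait(env, io, &queue->c2_qin, &anchor,
+ *				     timeout);
+ *	// completion handlers call cl_sync_io_note(&anchor, ioret) per page
+ */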
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ * - there is a (mostly) fixed number of threads, and
+ *
+ * - call chains have no non-lustre portions inserted between lustre calls.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into the VFS or MM code that calls back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ * - allocation and destruction of environments is amortized by caching
+ * no-longer-used environments instead of destroying them;
+ *
+ * - there is a notion of "current" environment, attached to the kernel
+ * data structure representing the current thread. Top-level lustre code
+ * allocates an environment and makes it current, then calls into
+ * non-lustre code, which in turn calls lustre back. Low-level lustre
+ * code thus called can fetch the environment created by the top-level
+ * code and reuse it, avoiding an additional environment allocation.
+ * Right now, three interfaces can attach a cl_env to the running thread:
+ * - cl_env_get
+ * - cl_env_implant
+ * - cl_env_reexit (cl_env_reenter must have been called beforehand)
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {
+ int cen_refcheck;
+ void *cen_cookie;
+};
+
+struct lu_env *cl_env_peek (int *refcheck);
+struct lu_env *cl_env_get (int *refcheck);
+struct lu_env *cl_env_alloc (int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
+void cl_env_put (struct lu_env *env, int *refcheck);
+void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
+void *cl_env_reenter (void);
+void cl_env_reexit (void *cookie);
+void cl_env_implant (struct lu_env *env, int *refcheck);
+void cl_env_unplant (struct lu_env *env, int *refcheck);
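+
+/*
+ * Illustrative sketch: the common get/put pattern for obtaining an
+ * environment on a client thread.  The refcheck cookie returned by
+ * cl_env_get() must be passed back unchanged to cl_env_put().
+ *
+ *	struct lu_env *env;
+ *	int refcheck;
+ *
+ *	env = cl_env_get(&refcheck);	// reuses a cached environment if any
+ *	if (IS_ERR(env))
+ *		return PTR_ERR(env);
+ *	// ... call cl_* interfaces with env ...
+ *	cl_env_put(env, &refcheck);
+ */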
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+ struct lu_device_type *ldt,
+ struct lu_device *next);
+/** @} clio */
+
+int cl_global_init(void);
+void cl_global_fini(void);
+
+#endif /* _LINUX_CL_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h
new file mode 100644
index 000000000..be4c7d95e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/dt_object.h
@@ -0,0 +1,1499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_DT_OBJECT_H
+#define __LUSTRE_DT_OBJECT_H
+
+/** \defgroup dt dt
+ * Sub-class of lu_object with methods common for "data" objects in the OST
+ * stack.
+ *
+ * Data objects behave like regular files: you can read/write them, and get
+ * and set their attributes. An implementation of the dt interface is
+ * supposed to provide some form of garbage collection, normally a
+ * reference-counting (nlink) based one.
+ *
+ * Examples: osd (lustre/osd) is an implementation of dt interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include "lu_object.h"
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+typedef enum {
+ MNTOPT_USERXATTR = 0x00000001,
+ MNTOPT_ACL = 0x00000002,
+} mntopt_t;
+
+struct dt_device_param {
+ unsigned ddp_max_name_len;
+ unsigned ddp_max_nlink;
+ unsigned ddp_block_shift;
+ mntopt_t ddp_mntopts;
+ unsigned ddp_max_ea_size;
+ void *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */
+ int ddp_mount_type;
+ unsigned long long ddp_maxbytes;
+ /* percentage of available space to reserve for grant error margin */
+ int ddp_grant_reserved;
+ /* per-inode space consumption */
+ short ddp_inodespace;
+ /* per-fragment grant overhead to be used by client for grant
+ * calculation */
+ int ddp_grant_frag;
+};
+
+/**
+ * Per-transaction commit callback function
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+ struct dt_txn_commit_cb *cb, int err);
+/**
+ * Special per-transaction callback for cases when just a commit callback
+ * is needed and per-device callbacks are not convenient to use
+ */
+#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN 32
+
+struct dt_txn_commit_cb {
+ struct list_head dcb_linkage;
+ dt_cb_t dcb_func;
+ __u32 dcb_magic;
+ char dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
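+
+/*
+ * Illustrative sketch (names are made up): declaring a per-transaction
+ * commit callback and registering it on a started transaction with
+ * dt_trans_cb_add(), defined later in this header.
+ *
+ *	static void foo_commit_cb(struct lu_env *env, struct thandle *th,
+ *				  struct dt_txn_commit_cb *cb, int err)
+ *	{
+ *		// runs once the transaction reaches stable storage
+ *	}
+ *
+ *	static struct dt_txn_commit_cb foo_dcb = {
+ *		.dcb_func = foo_commit_cb,
+ *		.dcb_name = "foo",
+ *	};
+ *
+ *	rc = dt_trans_cb_add(th, &foo_dcb);	// also sets dcb_magic
+ */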
+
+/**
+ * Operations on dt device.
+ */
+struct dt_device_operations {
+ /**
+ * Return device-wide statistics.
+ */
+ int (*dt_statfs)(const struct lu_env *env,
+ struct dt_device *dev, struct obd_statfs *osfs);
+ /**
+ * Create a transaction.
+ */
+ struct thandle *(*dt_trans_create)(const struct lu_env *env,
+ struct dt_device *dev);
+ /**
+ * Start a previously created transaction.
+ */
+ int (*dt_trans_start)(const struct lu_env *env,
+ struct dt_device *dev, struct thandle *th);
+ /**
+ * Finish previously started transaction.
+ */
+ int (*dt_trans_stop)(const struct lu_env *env,
+ struct thandle *th);
+ /**
+ * Add commit callback to the transaction.
+ */
+ int (*dt_trans_cb_add)(struct thandle *th,
+ struct dt_txn_commit_cb *dcb);
+ /**
+ * Return fid of root index object.
+ */
+ int (*dt_root_get)(const struct lu_env *env,
+ struct dt_device *dev, struct lu_fid *f);
+ /**
+ * Return device configuration data.
+ */
+ void (*dt_conf_get)(const struct lu_env *env,
+ const struct dt_device *dev,
+ struct dt_device_param *param);
+ /**
+ * handling device state, mostly for tests
+ */
+ int (*dt_sync)(const struct lu_env *env, struct dt_device *dev);
+ int (*dt_ro)(const struct lu_env *env, struct dt_device *dev);
+ /**
+ * Start a transaction commit asynchronously
+ *
+ * \param env environment
+ * \param dev dt_device to start commit on
+ *
+ * \return 0 success, negative value if error
+ */
+ int (*dt_commit_async)(const struct lu_env *env,
+ struct dt_device *dev);
+ /**
+ * Initialize capability context.
+ */
+ int (*dt_init_capa_ctxt)(const struct lu_env *env,
+ struct dt_device *dev,
+ int mode, unsigned long timeout,
+ __u32 alg, struct lustre_capa_key *keys);
+};
+
+struct dt_index_features {
+ /** required feature flags from enum dt_index_flags */
+ __u32 dif_flags;
+ /** minimal required key size */
+ size_t dif_keysize_min;
+ /** maximal required key size, 0 if no limit */
+ size_t dif_keysize_max;
+ /** minimal required record size */
+ size_t dif_recsize_min;
+ /** maximal required record size, 0 if no limit */
+ size_t dif_recsize_max;
+ /** pointer size for record */
+ size_t dif_ptrsize;
+};
+
+enum dt_index_flags {
+ /** index supports variable sized keys */
+ DT_IND_VARKEY = 1 << 0,
+ /** index supports variable sized records */
+ DT_IND_VARREC = 1 << 1,
+ /** index can be modified */
+ DT_IND_UPDATE = 1 << 2,
+ /** index supports records with non-unique (duplicate) keys */
+ DT_IND_NONUNQ = 1 << 3,
+ /**
+ * index supports fixed-size keys sorted in natural numerical order
+ * and is able to return the left-side value if no exact match is found
+ */
+ DT_IND_RANGE = 1 << 4,
+};
+
+/**
+ * Features required from an index to support file system directories
+ * (mapping names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/**
+ * This is a general purpose dt allocation hint.
+ * It currently contains only the parent object, but may carry additional
+ * allocation hints in the future.
+ */
+struct dt_allocation_hint {
+ struct dt_object *dah_parent;
+ __u32 dah_mode;
+};
+
+/**
+ * object type specifier.
+ */
+
+enum dt_format_type {
+ DFT_REGULAR,
+ DFT_DIR,
+ /** for mknod */
+ DFT_NODE,
+ /** for special index */
+ DFT_INDEX,
+ /** for symbolic link */
+ DFT_SYM,
+};
+
+/**
+ * object format specifier.
+ */
+struct dt_object_format {
+ /** type for dt object */
+ enum dt_format_type dof_type;
+ union {
+ struct dof_regular {
+ int striped;
+ } dof_reg;
+ struct dof_dir {
+ } dof_dir;
+ struct dof_node {
+ } dof_node;
+ /**
+ * a special index needs its features passed as a parameter
+ * when it is created
+ */
+ struct dof_index {
+ const struct dt_index_features *di_feat;
+ } dof_idx;
+ } u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
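+
+/*
+ * Illustrative sketch: filling a dt_object_format for a regular file before
+ * declaring its creation.  "env", "obj", "attr", "hint" and "th" are assumed
+ * to be prepared by the caller; see dt_declare_create()/dt_create() later in
+ * this header.
+ *
+ *	struct dt_object_format dof;
+ *	int rc;
+ *
+ *	dof.dof_type = dt_mode_to_dft(S_IFREG);	// DFT_REGULAR
+ *	dof.u.dof_reg.striped = 0;
+ *	rc = dt_declare_create(env, obj, attr, hint, &dof, th);
+ */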
+
+typedef __u64 dt_obj_version_t;
+
+/**
+ * Per-dt-object operations.
+ */
+struct dt_object_operations {
+ void (*do_read_lock)(const struct lu_env *env,
+ struct dt_object *dt, unsigned role);
+ void (*do_write_lock)(const struct lu_env *env,
+ struct dt_object *dt, unsigned role);
+ void (*do_read_unlock)(const struct lu_env *env,
+ struct dt_object *dt);
+ void (*do_write_unlock)(const struct lu_env *env,
+ struct dt_object *dt);
+ int (*do_write_locked)(const struct lu_env *env,
+ struct dt_object *dt);
+ /**
+ * Note: the following ->do_{x,}attr_{set,get}() operations are very
+ * similar to ->moo_{x,}attr_{set,get}() operations in struct
+ * md_object_operations (see md_object.h). These operations are not in
+ * lu_object_operations, because ->do_{x,}attr_set() versions take
+ * transaction handle as an argument (this transaction is started by
+ * caller). We might factor ->do_{x,}attr_get() into
+ * lu_object_operations, but that would break existing symmetry.
+ */
+
+ /**
+ * Return standard attributes.
+ *
+ * precondition: lu_object_exists(&dt->do_lu);
+ */
+ int (*do_attr_get)(const struct lu_env *env,
+ struct dt_object *dt, struct lu_attr *attr,
+ struct lustre_capa *capa);
+ /**
+ * Set standard attributes.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *handle);
+ int (*do_attr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *attr,
+ struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Return a value of an extended attribute.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, const char *name,
+ struct lustre_capa *capa);
+ /**
+ * Set value of an extended attribute.
+ *
+ * \a fl - flags from enum lu_xattr_flags
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_xattr_set)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_buf *buf,
+ const char *name, int fl,
+ struct thandle *handle);
+ int (*do_xattr_set)(const struct lu_env *env,
+ struct dt_object *dt, const struct lu_buf *buf,
+ const char *name, int fl, struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Delete existing extended attribute.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_xattr_del)(const struct lu_env *env,
+ struct dt_object *dt,
+ const char *name, struct thandle *handle);
+ int (*do_xattr_del)(const struct lu_env *env,
+ struct dt_object *dt,
+ const char *name, struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Place list of existing extended attributes into \a buf (which has
+ * length len).
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_xattr_list)(const struct lu_env *env,
+ struct dt_object *dt, struct lu_buf *buf,
+ struct lustre_capa *capa);
+ /**
+ * Init allocation hint using parent object and child mode.
+ * (1) The \a parent might be NULL if this is a partial creation for
+ * a remote object.
+ * (2) The type of child is in \a child_mode.
+ * (3) The result hint is stored in \a ah;
+ */
+ void (*do_ah_init)(const struct lu_env *env,
+ struct dt_allocation_hint *ah,
+ struct dt_object *parent,
+ struct dt_object *child,
+ umode_t child_mode);
+ /**
+ * Create new object on this device.
+ *
+ * precondition: !dt_object_exists(dt);
+ * postcondition: ergo(result == 0, dt_object_exists(dt));
+ */
+ int (*do_declare_create)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th);
+ int (*do_create)(const struct lu_env *env, struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th);
+
+ /**
+ * Destroy object on this device.
+ *
+ * precondition: dt_object_exists(dt);
+ * postcondition: ergo(result == 0, !dt_object_exists(dt));
+ */
+ int (*do_declare_destroy)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th);
+ int (*do_destroy)(const struct lu_env *env, struct dt_object *dt,
+ struct thandle *th);
+
+ /**
+ * Announce that this object is going to be used as an index. This
+ * operation checks that the object supports indexing operations and
+ * installs the appropriate dt_index_operations vector on success.
+ *
+ * Also probes for features. Operation is successful if all required
+ * features are supported.
+ */
+ int (*do_index_try)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_index_features *feat);
+ /**
+ * Increase the nlink count of the object.
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_ref_add)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+ int (*do_ref_add)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+ /**
+ * Decrease the nlink count of the object.
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_declare_ref_del)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+ int (*do_ref_del)(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th);
+
+ struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lustre_capa *old,
+ __u64 opc);
+ int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj,
+ __u64 start, __u64 end);
+ /**
+ * Get object info of the next level. Currently this only gets the inode
+ * from osd. It is only used by quota (b=16542).
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+ void **data);
+
+ /**
+ * Lock object.
+ */
+ int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+ struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ void *policy);
+};
+
+/**
+ * Per-dt-object operations on "file body".
+ */
+struct dt_body_operations {
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos,
+ struct lustre_capa *capa);
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ ssize_t (*dbo_declare_write)(const struct lu_env *env,
+ struct dt_object *dt,
+ const loff_t size, loff_t pos,
+ struct thandle *handle);
+ ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_buf *buf, loff_t *pos,
+ struct thandle *handle, struct lustre_capa *capa,
+ int ignore_quota);
+ /*
+ * methods for zero-copy IO
+ */
+
+ /*
+ * precondition: dt_object_exists(dt);
+ * returns:
+ * < 0 - error code
+ * = 0 - illegal
+ * > 0 - number of local buffers prepared
+ */
+ int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt,
+ loff_t pos, ssize_t len, struct niobuf_local *lb,
+ int rw, struct lustre_capa *capa);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *lb, int nr);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *lb, int nr);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_declare_write_commit)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct niobuf_local *,
+ int, struct thandle *);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *, int, struct thandle *);
+ /*
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt,
+ struct niobuf_local *lnb, int nr);
+ int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt,
+ struct ll_user_fiemap *fm);
+ /**
+ * Punch object's content
+ * precondition: regular object, not index
+ */
+ int (*dbo_declare_punch)(const struct lu_env *, struct dt_object *,
+ __u64, __u64, struct thandle *th);
+ int (*dbo_punch)(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end, struct thandle *th,
+ struct lustre_capa *capa);
+};
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index.
+ */
+struct dt_index_operations {
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt,
+ struct dt_rec *rec, const struct dt_key *key,
+ struct lustre_capa *capa);
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dio_declare_insert)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_rec *rec,
+ const struct dt_key *key,
+ struct thandle *handle);
+ int (*dio_insert)(const struct lu_env *env, struct dt_object *dt,
+ const struct dt_rec *rec, const struct dt_key *key,
+ struct thandle *handle, struct lustre_capa *capa,
+ int ignore_quota);
+ /**
+ * precondition: dt_object_exists(dt);
+ */
+ int (*dio_declare_delete)(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_key *key,
+ struct thandle *handle);
+ int (*dio_delete)(const struct lu_env *env, struct dt_object *dt,
+ const struct dt_key *key, struct thandle *handle,
+ struct lustre_capa *capa);
+ /**
+ * Iterator interface
+ */
+ struct dt_it_ops {
+ /**
+ * Allocate and initialize new iterator.
+ *
+ * precondition: dt_object_exists(dt);
+ */
+ struct dt_it *(*init)(const struct lu_env *env,
+ struct dt_object *dt,
+ __u32 attr,
+ struct lustre_capa *capa);
+ void (*fini)(const struct lu_env *env,
+ struct dt_it *di);
+ int (*get)(const struct lu_env *env,
+ struct dt_it *di,
+ const struct dt_key *key);
+ void (*put)(const struct lu_env *env,
+ struct dt_it *di);
+ int (*next)(const struct lu_env *env,
+ struct dt_it *di);
+ struct dt_key *(*key)(const struct lu_env *env,
+ const struct dt_it *di);
+ int (*key_size)(const struct lu_env *env,
+ const struct dt_it *di);
+ int (*rec)(const struct lu_env *env,
+ const struct dt_it *di,
+ struct dt_rec *rec,
+ __u32 attr);
+ __u64 (*store)(const struct lu_env *env,
+ const struct dt_it *di);
+ int (*load)(const struct lu_env *env,
+ const struct dt_it *di, __u64 hash);
+ int (*key_rec)(const struct lu_env *env,
+ const struct dt_it *di, void *key_rec);
+ } dio_it;
+};
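+
+/*
+ * Illustrative sketch: walking an index with the iterator operations once
+ * ->do_index_try() has installed dt_index_operations on the object.  The
+ * "attr", "capa" and "rec" arguments are assumed to be prepared by the
+ * caller, and the load()/next() return conventions assumed here follow the
+ * common in-tree usage (e.g. dt_index_walk()): load() > 0 means positioned
+ * on a record, next() returns 0 while records remain.
+ *
+ *	const struct dt_it_ops *iops = &dt->do_index_ops->dio_it;
+ *	struct dt_it *it;
+ *	int rc;
+ *
+ *	it = iops->init(env, dt, attr, capa);
+ *	if (IS_ERR(it))
+ *		return PTR_ERR(it);
+ *	rc = iops->load(env, it, 0);		// seek to the first record
+ *	if (rc > 0)
+ *		rc = 0;				// already positioned
+ *	else if (rc == 0)
+ *		rc = iops->next(env, it);	// no exact match: advance
+ *	while (rc == 0) {
+ *		iops->rec(env, it, rec, attr);	// consume the current record
+ *		rc = iops->next(env, it);
+ *	}
+ *	iops->put(env, it);
+ *	iops->fini(env, it);
+ */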
+
+enum dt_otable_it_valid {
+ DOIV_ERROR_HANDLE = 0x0001,
+};
+
+enum dt_otable_it_flags {
+ /* Exit when fail. */
+ DOIF_FAILOUT = 0x0001,
+
+ /* Reset iteration position to the device beginning. */
+ DOIF_RESET = 0x0002,
+
+ /* There is an upper-layer component using the iteration. */
+ DOIF_OUTUSED = 0x0004,
+};
+
+/* otable based iteration needs to use the common DT iteration APIs.
+ * To initialize the iteration, it needs to call dio_it::init() first.
+ * Here is how the otable based iteration should prepare arguments to
+ * call dt_it_ops::init().
+ *
+ * For otable based iteration, the 32-bit 'attr' for dt_it_ops::init()
+ * is composed of two parts:
+ * the low 16 bits are valid bits, the high 16 bits are flag bits. */
+#define DT_OTABLE_IT_FLAGS_SHIFT 16
+#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000
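+
+/*
+ * Illustrative example: composing the 32-bit 'attr' argument for an
+ * otable-based dio_it::init() call from the enums above.
+ *
+ *	__u32 attr = (DOIF_RESET << DT_OTABLE_IT_FLAGS_SHIFT) |
+ *		     DOIV_ERROR_HANDLE;
+ *	// high 16 bits: DOIF_* flags, low 16 bits: DOIV_* valid bits
+ */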
+
+struct dt_device {
+ struct lu_device dd_lu_dev;
+ const struct dt_device_operations *dd_ops;
+
+ /**
+ * List of dt_txn_callback (see below). This is not protected in any
+ * way, because callbacks are supposed to be added/deleted only during
+ * single-threaded start-up/shut-down procedures.
+ */
+ struct list_head dd_txn_callbacks;
+};
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+ return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT);
+}
+
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+ LASSERT(lu_device_is_dt(l));
+ return container_of0(l, struct dt_device, dd_lu_dev);
+}
+
+struct dt_object {
+ struct lu_object do_lu;
+ const struct dt_object_operations *do_ops;
+ const struct dt_body_operations *do_body_ops;
+ const struct dt_index_operations *do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+ /* all initialized llog systems on this node linked by this */
+ struct list_head los_list;
+
+ /* how many handles reference this los */
+ atomic_t los_refcount;
+ struct dt_device *los_dev;
+ struct dt_object *los_obj;
+
+ /* data used to generate new fids */
+ struct mutex los_id_lock;
+ __u64 los_seq;
+ __u32 los_last_oid;
+};
+
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+ LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+ return container_of0(l, struct dt_object, do_lu);
+}
+
+int dt_object_init(struct dt_object *obj,
+ struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+ return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+ return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+ LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+ return container_of0(o, struct dt_object, do_lu);
+}
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ * This transaction handle is allocated upon starting a new transaction,
+ * and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ * We do _NOT_ support nested transactions. So, every thread should only
+ * have one active transaction, and a transaction only belongs to one
+ * thread. Due to this, transaction handles need no reference count.
+ * 3. Transaction & dt_object locking
+ * dt_object locks should be taken inside a transaction.
+ * 4. Transaction & RPC
+ * No RPC request should be issued inside a transaction.
+ */
+struct thandle {
+ /** the dt device on which the transactions are executed */
+ struct dt_device *th_dev;
+
+ /** context for this transaction, tag is LCT_TX_HANDLE */
+ struct lu_context th_ctx;
+
+ /** additional tags (layers can add in declare) */
+ __u32 th_tags;
+
+ /** the last operation result in this transaction.
+ * This value is used in recovery */
+ __s32 th_result;
+
+ /** whether we need sync commit */
+ unsigned int th_sync:1;
+
+ /* local transaction, no need to inform other layers */
+ unsigned int th_local:1;
+
+ /* In DNE, one transaction can be disassembled into
+ * updates on several different MDTs, and these updates
+ * will be attached to th_remote_update_list per target.
+ * Only a single thread will access the list, so no lock is needed.
+ */
+ struct list_head th_remote_update_list;
+ struct update_request *th_current_request;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or underlying transaction engine) when
+ * transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions for each transaction state transition. A typical
+ * example is the mdt registering a call-back to write into the last-received
+ * file before each transaction commit.
+ */
+struct dt_txn_callback {
+ int (*dtc_txn_start)(const struct lu_env *env,
+ struct thandle *txn, void *cookie);
+ int (*dtc_txn_stop)(const struct lu_env *env,
+ struct thandle *txn, void *cookie);
+ void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+ void *dtc_cookie;
+ __u32 dtc_tag;
+ struct list_head dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
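+
+/*
+ * Illustrative sketch (names are made up): registering per-device
+ * transaction hooks during single-threaded setup and removing them at
+ * cleanup.
+ *
+ *	static int foo_txn_start(const struct lu_env *env,
+ *				 struct thandle *txn, void *cookie)
+ *	{
+ *		return 0;	// e.g. account credits, update layer state
+ *	}
+ *
+ *	static struct dt_txn_callback foo_txn_cb = {
+ *		.dtc_txn_start = foo_txn_start,
+ *		.dtc_cookie    = NULL,
+ *	};
+ *
+ *	dt_txn_callback_add(dev, &foo_txn_cb);	// during setup
+ *	// ...
+ *	dt_txn_callback_del(dev, &foo_txn_cb);	// during shutdown
+ */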
+
+int dt_txn_hook_start(const struct lu_env *env,
+ struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve
+ */
+typedef int (*dt_entry_func_t)(const struct lu_env *env,
+ const char *name,
+ void *pvt);
+
+#define DT_MAX_PATH 1024
+
+int dt_path_parser(const struct lu_env *env,
+ char *local, dt_entry_func_t entry_func,
+ void *data);
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+ const char *path, struct lu_fid *fid);
+
+struct dt_object *dt_store_open(const struct lu_env *env,
+ struct dt_device *dt,
+ const char *dirname,
+ const char *filename,
+ struct lu_fid *fid);
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object_format *dof,
+ struct lu_attr *attr);
+
+struct dt_object *dt_locate_at(const struct lu_env *env,
+ struct dt_device *dev,
+ const struct lu_fid *fid,
+ struct lu_device *top_dev);
+static inline struct dt_object *
+dt_locate(const struct lu_env *env, struct dt_device *dev,
+ const struct lu_fid *fid)
+{
+ return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev);
+}
+
+
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+ const struct lu_fid *first_fid,
+ struct local_oid_storage **los);
+void local_oid_storage_fini(const struct lu_env *env,
+ struct local_oid_storage *los);
+int local_object_fid_generate(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct lu_fid *fid);
+int local_object_declare_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *o,
+ struct lu_attr *attr,
+ struct dt_object_format *dof,
+ struct thandle *th);
+int local_object_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *o,
+ struct lu_attr *attr, struct dt_object_format *dof,
+ struct thandle *th);
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *parent,
+ const char *name, __u32 mode);
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object *parent,
+ const char *name,
+ __u32 mode);
+struct dt_object *
+local_index_find_or_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *parent,
+ const char *name, __u32 mode,
+ const struct dt_index_features *ft);
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object *parent,
+ const char *name, __u32 mode,
+ const struct dt_index_features *ft);
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+ struct dt_object *parent, const char *name);
+
+static inline int dt_object_lock(const struct lu_env *env,
+ struct dt_object *o, struct lustre_handle *lh,
+ struct ldlm_enqueue_info *einfo,
+ void *policy)
+{
+ LASSERT(o);
+ LASSERT(o->do_ops);
+ LASSERT(o->do_ops->do_object_lock);
+ return o->do_ops->do_object_lock(env, o, lh, einfo, policy);
+}
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+ const char *name, struct lu_fid *fid);
+
+static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o,
+ __u64 start, __u64 end)
+{
+ LASSERT(o);
+ LASSERT(o->do_ops);
+ LASSERT(o->do_ops->do_object_sync);
+ return o->do_ops->do_object_sync(env, o, start, end);
+}
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+ struct thandle *th);
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+ dt_obj_version_t version, struct thandle *th);
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o);
+
+
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos);
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos);
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_buf *buf, loff_t *pos, struct thandle *th);
+typedef int (*dt_index_page_build_t)(const struct lu_env *env,
+ union lu_page *lp, int nob,
+ const struct dt_it_ops *iops,
+ struct dt_it *it, __u32 attr, void *arg);
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+ const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+ void *arg);
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+ struct idx_info *ii, const struct lu_rdpg *rdpg);
+
+static inline struct thandle *dt_trans_create(const struct lu_env *env,
+ struct dt_device *d)
+{
+ LASSERT(d->dd_ops->dt_trans_create);
+ return d->dd_ops->dt_trans_create(env, d);
+}
+
+static inline int dt_trans_start(const struct lu_env *env,
+ struct dt_device *d, struct thandle *th)
+{
+ LASSERT(d->dd_ops->dt_trans_start);
+ return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+/* for this transaction, hooks shouldn't be called */
+static inline int dt_trans_start_local(const struct lu_env *env,
+ struct dt_device *d, struct thandle *th)
+{
+ LASSERT(d->dd_ops->dt_trans_start);
+ th->th_local = 1;
+ return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+static inline int dt_trans_stop(const struct lu_env *env,
+ struct dt_device *d, struct thandle *th)
+{
+ LASSERT(d->dd_ops->dt_trans_stop);
+ return d->dd_ops->dt_trans_stop(env, th);
+}
+
+static inline int dt_trans_cb_add(struct thandle *th,
+ struct dt_txn_commit_cb *dcb)
+{
+ LASSERT(th->th_dev->dd_ops->dt_trans_cb_add);
+ dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+ return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb);
+}
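+
+/*
+ * Illustrative sketch: the declare/start/execute/stop sequence for a single
+ * local update, following the life-cycle rules documented on struct thandle
+ * above.  "env", "dev", "obj", "buf" (a lu_buf) and "pos" are assumed to be
+ * prepared by the caller.
+ *
+ *	struct thandle *th;
+ *	int rc;
+ *
+ *	th = dt_trans_create(env, dev);
+ *	if (IS_ERR(th))
+ *		return PTR_ERR(th);
+ *	rc = dt_declare_record_write(env, obj, buf->lb_len, pos, th);
+ *	if (rc == 0)
+ *		rc = dt_trans_start(env, dev, th);	// all declares done
+ *	if (rc == 0)
+ *		rc = dt_record_write(env, obj, buf, &pos, th);
+ *	dt_trans_stop(env, dev, th);	// always stop, even on error
+ */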
+/** @} dt */
+
+
+static inline int dt_declare_record_write(const struct lu_env *env,
+ struct dt_object *dt,
+ int size, loff_t pos,
+ struct thandle *th)
+{
+ int rc;
+
+ LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+ LASSERT(th != NULL);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_declare_write);
+ rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th);
+ return rc;
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_create);
+ return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lu_attr *attr,
+ struct dt_allocation_hint *hint,
+ struct dt_object_format *dof,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_create);
+ return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_destroy);
+ return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+ struct dt_object *dt,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_destroy);
+ return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+ struct dt_object *dt,
+ unsigned role)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_read_lock);
+ dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+ struct dt_object *dt,
+ unsigned role)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_write_lock);
+ dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+ struct dt_object *dt)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_read_unlock);
+ dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+ struct dt_object *dt)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_write_unlock);
+ dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+ struct dt_object *dt)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_write_locked);
+ return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+ struct lu_attr *la, void *arg)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_attr_get);
+ return dt->do_ops->do_attr_get(env, dt, la, arg);
+}
+
+static inline int dt_declare_attr_set(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_attr *la,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_attr_set);
+ return dt->do_ops->do_declare_attr_set(env, dt, la, th);
+}
+
+static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_attr *la, struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_attr_set);
+ return dt->do_ops->do_attr_set(env, dt, la, th, capa);
+}
+
+static inline int dt_declare_ref_add(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_ref_add);
+ return dt->do_ops->do_declare_ref_add(env, dt, th);
+}
+
+static inline int dt_ref_add(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_ref_add);
+ return dt->do_ops->do_ref_add(env, dt, th);
+}
+
+static inline int dt_declare_ref_del(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_ref_del);
+ return dt->do_ops->do_declare_ref_del(env, dt, th);
+}
+
+static inline int dt_ref_del(const struct lu_env *env,
+ struct dt_object *dt, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_ref_del);
+ return dt->do_ops->do_ref_del(env, dt, th);
+}
+
+static inline struct obd_capa *dt_capa_get(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lustre_capa *old, __u64 opc)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_capa_get);
+ return dt->do_ops->do_capa_get(env, dt, old, opc);
+}
+
+static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_remote *rnb,
+ struct niobuf_local *lnb, int rw,
+ struct lustre_capa *capa)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_bufs_get);
+ return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset,
+ rnb->len, lnb, rw, capa);
+}
+
+static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_local *lnb, int n)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_bufs_put);
+ return d->do_body_ops->dbo_bufs_put(env, d, lnb, n);
+}
+
+static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_local *lnb, int n)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_write_prep);
+ return d->do_body_ops->dbo_write_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_write_commit(const struct lu_env *env,
+ struct dt_object *d,
+ struct niobuf_local *lnb,
+ int n, struct thandle *th)
+{
+ LASSERTF(d != NULL, "dt is NULL when we want to declare write\n");
+ LASSERT(th != NULL);
+ return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th);
+}
+
+
+static inline int dt_write_commit(const struct lu_env *env,
+ struct dt_object *d, struct niobuf_local *lnb,
+ int n, struct thandle *th)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_write_commit);
+ return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d,
+ struct niobuf_local *lnb, int n)
+{
+ LASSERT(d);
+ LASSERT(d->do_body_ops);
+ LASSERT(d->do_body_ops->dbo_read_prep);
+ return d->do_body_ops->dbo_read_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_punch(const struct lu_env *env,
+ struct dt_object *dt, __u64 start,
+ __u64 end, struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_declare_punch);
+ return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+}
+
+static inline int dt_punch(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end, struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_punch);
+ return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa);
+}
+
+static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d,
+ struct ll_user_fiemap *fm)
+{
+ LASSERT(d);
+ if (d->do_body_ops == NULL)
+ return -EPROTO;
+ if (d->do_body_ops->dbo_fiemap_get == NULL)
+ return -EOPNOTSUPP;
+ return d->do_body_ops->dbo_fiemap_get(env, d, fm);
+}
+
+static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev,
+ struct obd_statfs *osfs)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_statfs);
+ return dev->dd_ops->dt_statfs(env, dev, osfs);
+}
+
+static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev,
+ struct lu_fid *f)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_root_get);
+ return dev->dd_ops->dt_root_get(env, dev, f);
+}
+
+static inline void dt_conf_get(const struct lu_env *env,
+ const struct dt_device *dev,
+ struct dt_device_param *param)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_conf_get);
+ return dev->dd_ops->dt_conf_get(env, dev, param);
+}
+
+static inline int dt_sync(const struct lu_env *env, struct dt_device *dev)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_sync);
+ return dev->dd_ops->dt_sync(env, dev);
+}
+
+static inline int dt_ro(const struct lu_env *env, struct dt_device *dev)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_ro);
+ return dev->dd_ops->dt_ro(env, dev);
+}
+
+static inline int dt_declare_insert(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_rec *rec,
+ const struct dt_key *key,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_declare_insert);
+ return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th);
+}
+
+static inline int dt_insert(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_rec *rec,
+ const struct dt_key *key,
+ struct thandle *th,
+ struct lustre_capa *capa,
+ int noquota)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_insert);
+ return dt->do_index_ops->dio_insert(env, dt, rec, key, th,
+ capa, noquota);
+}
+
+static inline int dt_declare_xattr_del(const struct lu_env *env,
+ struct dt_object *dt,
+ const char *name,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_xattr_del);
+ return dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+}
+
+static inline int dt_xattr_del(const struct lu_env *env,
+ struct dt_object *dt, const char *name,
+ struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_del);
+ return dt->do_ops->do_xattr_del(env, dt, name, th, capa);
+}
+
+static inline int dt_declare_xattr_set(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct lu_buf *buf,
+ const char *name, int fl,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_declare_xattr_set);
+ return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th);
+}
+
+static inline int dt_xattr_set(const struct lu_env *env,
+ struct dt_object *dt, const struct lu_buf *buf,
+ const char *name, int fl, struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_set);
+ return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa);
+}
+
+static inline int dt_xattr_get(const struct lu_env *env,
+ struct dt_object *dt, struct lu_buf *buf,
+ const char *name, struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_get);
+ return dt->do_ops->do_xattr_get(env, dt, buf, name, capa);
+}
+
+static inline int dt_xattr_list(const struct lu_env *env,
+ struct dt_object *dt, struct lu_buf *buf,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_ops);
+ LASSERT(dt->do_ops->do_xattr_list);
+ return dt->do_ops->do_xattr_list(env, dt, buf, capa);
+}
+
+static inline int dt_declare_delete(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_key *key,
+ struct thandle *th)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_declare_delete);
+ return dt->do_index_ops->dio_declare_delete(env, dt, key, th);
+}
+
+static inline int dt_delete(const struct lu_env *env,
+ struct dt_object *dt,
+ const struct dt_key *key,
+ struct thandle *th,
+ struct lustre_capa *capa)
+{
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_delete);
+ return dt->do_index_ops->dio_delete(env, dt, key, th, capa);
+}
+
+static inline int dt_commit_async(const struct lu_env *env,
+ struct dt_device *dev)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_commit_async);
+ return dev->dd_ops->dt_commit_async(env, dev);
+}
+
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+ struct dt_device *dev,
+ int mode, unsigned long timeout,
+ __u32 alg, struct lustre_capa_key *keys)
+{
+ LASSERT(dev);
+ LASSERT(dev->dd_ops);
+ LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+ return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode,
+ timeout, alg, keys);
+}
+
+static inline int dt_lookup(const struct lu_env *env,
+ struct dt_object *dt,
+ struct dt_rec *rec,
+ const struct dt_key *key,
+ struct lustre_capa *capa)
+{
+ int ret;
+
+ LASSERT(dt);
+ LASSERT(dt->do_index_ops);
+ LASSERT(dt->do_index_ops->dio_lookup);
+
+ ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ ret = -ENOENT;
+ return ret;
+}
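The wrappers above follow the dt declare/execute convention: every modification is first declared against a transaction handle, the transaction is started, and only then is the operation executed, usually under the object lock; dt_lookup() additionally folds the index layer's found/not-found result into 0/-ENOENT so callers only test for negative errors. A minimal, hedged sketch of that flow, assuming dt_trans_create()/dt_trans_start()/dt_trans_stop() are the transaction helpers declared earlier in this header and with error handling abbreviated:

/* Illustrative only: declare, start, execute, stop. */
static int example_dt_create(const struct lu_env *env, struct dt_device *dev,
                             struct dt_object *obj, struct lu_attr *attr,
                             struct dt_allocation_hint *hint,
                             struct dt_object_format *dof)
{
        struct thandle *th;
        int rc;

        th = dt_trans_create(env, dev);
        if (IS_ERR(th))
                return PTR_ERR(th);

        /* declare phase: reserve transaction credits for the create */
        rc = dt_declare_create(env, obj, attr, hint, dof, th);
        if (rc == 0)
                rc = dt_trans_start(env, dev, th);

        /* execute phase: perform the create under the object write lock */
        if (rc == 0) {
                dt_write_lock(env, obj, 0);
                rc = dt_create(env, obj, attr, hint, dof, th);
                dt_write_unlock(env, obj);
        }

        dt_trans_stop(env, dev, th);
        return rc;
}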
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+struct dt_find_hint {
+ struct lu_fid *dfh_fid;
+ struct dt_device *dfh_dt;
+ struct dt_object *dfh_o;
+};
+
+struct dt_thread_info {
+ char dti_buf[DT_MAX_PATH];
+ struct dt_find_hint dti_dfh;
+ struct lu_attr dti_attr;
+ struct lu_fid dti_fid;
+ struct dt_object_format dti_dof;
+ struct lustre_mdt_attrs dti_lma;
+ struct lu_buf dti_lb;
+ loff_t dti_off;
+};
+
+extern struct lu_context_key dt_key;
+
+static inline struct dt_thread_info *dt_info(const struct lu_env *env)
+{
+ struct dt_thread_info *dti;
+
+ dti = lu_context_key_get(&env->le_ctx, &dt_key);
+ LASSERT(dti);
+ return dti;
+}
+
+int dt_global_init(void);
+void dt_global_fini(void);
+
+#if defined (CONFIG_PROC_FS)
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
+#endif /* CONFIG_PROC_FS */
+
+#endif /* __LUSTRE_DT_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h
new file mode 100644
index 000000000..bf9027d5f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/interval_tree.h
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include "../../include/linux/libcfs/libcfs.h" /* LASSERT. */
+
+struct interval_node {
+ struct interval_node *in_left;
+ struct interval_node *in_right;
+ struct interval_node *in_parent;
+ unsigned in_color:1,
+ in_intree:1, /** set if the node is in tree */
+ in_res1:30;
+ __u8 in_res2[4]; /** tags, 8-bytes aligned */
+ __u64 in_max_high;
+ struct interval_node_extent {
+ __u64 start;
+ __u64 end;
+ } in_extent;
+};
+
+enum interval_iter {
+ INTERVAL_ITER_CONT = 1,
+ INTERVAL_ITER_STOP = 2
+};
+
+static inline int interval_is_intree(struct interval_node *node)
+{
+ return node->in_intree == 1;
+}
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+ return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+ return node->in_extent.end;
+}
+
+static inline void interval_set(struct interval_node *node,
+ __u64 start, __u64 end)
+{
+ LASSERT(start <= end);
+ node->in_extent.start = start;
+ node->in_extent.end = end;
+ node->in_max_high = end;
+}
+
+/* Rules for writing an interval callback:
+ * - the callback returns INTERVAL_ITER_STOP when it decides the iteration
+ *   should stop; the iteration function then returns immediately with
+ *   INTERVAL_ITER_STOP.
+ * - callbacks for interval_iterate and interval_iterate_reverse: every
+ *   node in the tree is passed as @node before the callback is called.
+ * - callback for interval_search: only overlapping nodes are passed as
+ *   @node before the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+ void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+ struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapping
+ * extent. */
+enum interval_iter interval_search(struct interval_node *root,
+ struct interval_node_extent *ex,
+ interval_callback_t func, void *data);
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+ interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+ interval_callback_t func, void *data);
+
+void interval_expand(struct interval_node *root,
+ struct interval_node_extent *ext,
+ struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+ struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+ struct interval_node_extent *ex);
+#endif
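As a rough illustration of the callback rules documented in this header, the sketch below counts the extents overlapping a query extent; only the interval_* API itself comes from the header above, everything else is hypothetical:

/* Illustrative only: count how many tree extents overlap @ex. */
static enum interval_iter count_overlaps_cb(struct interval_node *node,
                                            void *args)
{
        unsigned int *count = args;

        (*count)++;
        return INTERVAL_ITER_CONT;      /* keep searching */
}

static unsigned int count_overlaps(struct interval_node *root,
                                   struct interval_node_extent *ex)
{
        unsigned int count = 0;

        interval_search(root, ex, count_overlaps_cb, &count);
        return count;
}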
diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h
new file mode 100644
index 000000000..c5c3a8d9e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lclient.h
@@ -0,0 +1,433 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ * Author: Oleg Drokin <oleg.drokin@sun.com>
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+blkcnt_t dirty_cnt(struct inode *inode);
+
+int cl_glimpse_size0(struct inode *inode, int agl);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+ struct inode *inode, struct cl_object *clob, int agl);
+
+static inline int cl_glimpse_size(struct inode *inode)
+{
+ return cl_glimpse_size0(inode, 0);
+}
+
+static inline int cl_agl(struct inode *inode)
+{
+ return cl_glimpse_size0(inode, 1);
+}
+
+/**
+ * Locking policy for setattr.
+ */
+enum ccc_setattr_lock_type {
+ /** Locking is done by server */
+ SETATTR_NOLOCK,
+ /** Extent lock is enqueued */
+ SETATTR_EXTENT_LOCK,
+ /** Existing local extent lock is used */
+ SETATTR_MATCH_LOCK
+};
+
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+ /** super class */
+ struct cl_io_slice cui_cl;
+ struct cl_io_lock_link cui_link;
+ /**
+ * I/O vector information to or from which read/write is going.
+ */
+ struct iov_iter *cui_iter;
+ /**
+ * Total size for the left IO.
+ */
+ size_t cui_tot_count;
+
+ union {
+ struct {
+ enum ccc_setattr_lock_type cui_local_lock;
+ } setattr;
+ } u;
+ /**
+ * True iff io is processing glimpse right now.
+ */
+ int cui_glimpse;
+ /**
+ * Layout version when this IO is initialized
+ */
+ __u32 cui_layout_gen;
+ /**
+ * File descriptor against which IO is done.
+ */
+ struct ll_file_data *cui_fd;
+ struct kiocb *cui_iocb;
+};
+
+/**
+ * Returns true if \a io is a normal io, false for splice_{read,write}.
+ * Must be implemented in arch-specific code.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io);
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {
+ struct cl_lock_descr cti_descr;
+ struct cl_io cti_io;
+ struct cl_attr cti_attr;
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+ struct ccc_thread_info *info;
+
+ info = lu_context_key_get(&env->le_ctx, &ccc_key);
+ LASSERT(info != NULL);
+ return info;
+}
+
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+ struct cl_attr *attr = &ccc_env_info(env)->cti_attr;
+
+ memset(attr, 0, sizeof(*attr));
+ return attr;
+}
+
+static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env)
+{
+ struct cl_io *io = &ccc_env_info(env)->cti_io;
+
+ memset(io, 0, sizeof(*io));
+ return io;
+}
+
+struct ccc_session {
+ struct ccc_io cs_ios;
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+ struct ccc_session *ses;
+
+ ses = lu_context_key_get(env->le_ses, &ccc_session_key);
+ LASSERT(ses != NULL);
+ return ses;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+ return &ccc_env_session(env)->cs_ios;
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+ struct cl_object_header cob_header;
+ struct cl_object cob_cl;
+ struct inode *cob_inode;
+
+ /**
+ * A list of dirty pages pending IO in the cache. Used by
+ * SOM. Protected by ll_inode_info::lli_lock.
+ *
+ * \see ccc_page::cpg_pending_linkage
+ */
+ struct list_head cob_pending_list;
+
+ /**
+ * Access to this counter is protected by inode->i_sem. Now that
+ * the lifetime of transient pages must be covered by the inode
+ * semaphore, we don't need to hold any additional lock.
+ */
+ int cob_transient_pages;
+ /**
+ * Number of outstanding mmaps on this file.
+ *
+ * \see ll_vm_open(), ll_vm_close().
+ */
+ atomic_t cob_mmap_cnt;
+
+ /**
+ * Various flags.
+ * cob_discard_page_warned:
+ * if pages belonging to this object are discarded when a client
+ * is evicted, some debug info is printed. The flag is set while
+ * the first discarded page is processed, to avoid flooding the
+ * debug log for the remaining discarded pages.
+ *
+ * \see ll_dirty_page_discard_warn.
+ */
+ unsigned int cob_discard_page_warned:1;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+ struct cl_page_slice cpg_cl;
+ int cpg_defer_uptodate;
+ int cpg_ra_used;
+ int cpg_write_queued;
+ /**
+ * Non-empty iff this page is already counted in
+ * ccc_object::cob_pending_list. Protected by
+ * ccc_object::cob_pending_guard. This list is only used as a flag,
+ * that is, never iterated through, only checked for list_empty(), but
+ * having a list is useful for debugging.
+ */
+ struct list_head cpg_pending_linkage;
+ /** VM page */
+ struct page *cpg_page;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{
+ return container_of(slice, struct ccc_page, cpg_cl);
+}
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+
+struct ccc_device {
+ struct cl_device cdv_cl;
+ struct super_block *cdv_sb;
+ struct cl_device *cdv_next;
+};
+
+struct ccc_lock {
+ struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+ struct cl_req_slice crq_cl;
+};
+
+void *ccc_key_init (const struct lu_context *ctx,
+ struct lu_context_key *key);
+void ccc_key_fini (const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key);
+void ccc_session_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+
+int ccc_device_init (const struct lu_env *env,
+ struct lu_device *d,
+ const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+ struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg,
+ const struct lu_device_operations *luops,
+ const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+ struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev,
+ const struct cl_object_operations *clops,
+ const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env, struct ccc_object *vob,
+ const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io,
+ const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+ const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf);
+struct page *ccc_page_vmpage(const struct lu_env *env,
+ const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+int ccc_transient_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io, int nonblock);
+void ccc_transient_page_assume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io);
+void ccc_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *io, __u32 enqflags);
+int ccc_lock_use(const struct lu_env *env, const struct cl_lock_slice *slice);
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+ __u32 enqflags, enum cl_lock_mode mode,
+ pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+ __u32 enqflags, enum cl_lock_mode mode,
+ loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+ size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+ struct cl_io *io);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io, loff_t start, size_t count, int *exceed);
+void ccc_req_completion(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *oa, u64 flags);
+
+struct lu_device *ccc2lu_dev (struct ccc_device *vdv);
+struct lu_object *ccc2lu (struct ccc_object *vob);
+struct ccc_device *lu2ccc_dev (const struct lu_device *d);
+struct ccc_device *cl2ccc_dev (const struct cl_device *d);
+struct ccc_object *lu2ccc (const struct lu_object *obj);
+struct ccc_object *cl2ccc (const struct cl_object *obj);
+struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice);
+struct ccc_io *cl2ccc_io (const struct lu_env *env,
+ const struct cl_io_slice *slice);
+struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice);
+struct page *cl2vm_page (const struct cl_page_slice *slice);
+struct inode *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object *cl_inode2ccc (struct inode *inode);
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+ struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+__u16 ll_dirent_type_get(struct lu_dirent *ent);
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32);
+__u32 cl_fid_build_gen(const struct lu_fid *fid);
+
+# define CLOBINVRNT(env, clob, expr) \
+ ((void)sizeof(env), (void)sizeof(clob), (void)sizeof(!!(expr)))
+
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
+int cl_ocd_update(struct obd_device *host,
+ struct obd_device *watched,
+ enum obd_notify_event ev, void *owner, void *data);
+
+struct ccc_grouplock {
+ struct lu_env *cg_env;
+ struct cl_io *cg_io;
+ struct cl_lock *cg_lock;
+ unsigned long cg_gid;
+};
+
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+ struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
+
+/**
+ * New interfaces to get and put lov_stripe_md from the lov layer. This
+ * violates layering because lov_stripe_md is supposed to be private data
+ * of lov.
+ *
+ * NB: If you find you have to use these interfaces for your new code, please
+ * think about it again. These interfaces may be removed in the future for
+ * better layering. */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj);
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm);
+int lov_read_and_clear_async_rc(struct cl_object *clob);
+
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode);
+void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
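A hedged sketch of the strict get/put discipline these discouraged accessors expect; the lsm_stripe_count field is an assumption about lov's private layout and is used here only to show why the layering warning above exists:

/* Illustrative only: peek at the layout, then drop the reference. */
static int example_peek_stripe_count(struct inode *inode)
{
        struct lov_stripe_md *lsm;
        int stripes = 0;

        lsm = ccc_inode_lsm_get(inode);
        if (lsm != NULL) {
                /* lsm_stripe_count is assumed; the layout really is
                 * lov-private, which is the point of the warning above. */
                stripes = lsm->lsm_stripe_count;
                ccc_inode_lsm_put(inode, lsm);
        }
        return stripes;
}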
+
+/**
+ * Data structure managing a client's cached clean pages. An LRU of
+ * pages is maintained, along with other statistics.
+ */
+struct cl_client_cache {
+ atomic_t ccc_users; /* # of users (OSCs) of this data */
+ struct list_head ccc_lru; /* LRU list of cached clean pages */
+ spinlock_t ccc_lru_lock; /* lock for list */
+ atomic_t ccc_lru_left; /* # of LRU entries available */
+ unsigned long ccc_lru_max; /* Max # of LRU entries possible */
+ unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */
+};
+
+#endif /* LCLIENT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
new file mode 100644
index 000000000..3925db160
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
@@ -0,0 +1,216 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_COMPAT25_H
+#define _LINUX_COMPAT25_H
+
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+
+#include "lustre_patchless_compat.h"
+
+/*
+ * set ATTR_BLOCKS to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_BLOCKS (1 << 27)
+
+#define current_ngroups current_cred()->group_info->ngroups
+#define current_groups current_cred()->group_info->small_block
+
+/*
+ * OBD needs a working random driver, thus all our
+ * initialization routines must be called after device
+ * driver initialization.
+ */
+#ifndef MODULE
+#undef module_init
+#define module_init(a) late_initcall(a)
+#endif
+
+
+#define LTIME_S(time) (time.tv_sec)
+
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i) do {} while (0) /* for write unlock */
+# define inode_dio_read(i) atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+
+
+#ifndef FS_HAS_FIEMAP
+#define FS_HAS_FIEMAP (0)
+#endif
+
+#define ll_vfs_rmdir(dir, entry, mnt) vfs_rmdir(dir, entry)
+#define ll_vfs_mkdir(inode, dir, mnt, mode) vfs_mkdir(inode, dir, mode)
+#define ll_vfs_link(old, mnt, dir, new, mnt1) vfs_link(old, dir, new)
+#define ll_vfs_unlink(inode, entry, mnt) vfs_unlink(inode, entry)
+#define ll_vfs_mknod(dir, entry, mnt, mode, dev) \
+ vfs_mknod(dir, entry, mode, dev)
+#define ll_security_inode_unlink(dir, entry, mnt) \
+ security_inode_unlink(dir, entry)
+#define ll_vfs_rename(old, old_dir, mnt, new, new_dir, mnt1) \
+ vfs_rename(old, old_dir, new, new_dir, NULL, 0)
+
+#define cfs_bio_io_error(a, b) bio_io_error((a))
+#define cfs_bio_endio(a, b, c) bio_endio((a), (c))
+
+#define cfs_path_put(nd) path_put(&(nd)->path)
+
+
+#ifndef SLAB_DESTROY_BY_RCU
+#define SLAB_DESTROY_BY_RCU 0
+#endif
+
+
+
+static inline int
+ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount)
+{
+ int rc;
+
+ if (sb->s_qcop->quota_on) {
+ struct path path;
+
+ rc = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (rc)
+ return rc;
+ rc = sb->s_qcop->quota_on(sb, off, ver, &path);
+ path_put(&path);
+ return rc;
+ } else
+ return -ENOSYS;
+}
+
+static inline int ll_quota_off(struct super_block *sb, int off, int remount)
+{
+ if (sb->s_qcop->quota_off) {
+ return sb->s_qcop->quota_off(sb, off);
+ } else
+ return -ENOSYS;
+}
+
+
+# define ll_vfs_dq_init dquot_initialize
+# define ll_vfs_dq_drop dquot_drop
+# define ll_vfs_dq_transfer dquot_transfer
+# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1)
+
+
+
+
+
+#define queue_max_phys_segments(rq) queue_max_segments(rq)
+#define queue_max_hw_segments(rq) queue_max_segments(rq)
+
+
+#define ll_d_hlist_node hlist_node
+#define ll_d_hlist_empty(list) hlist_empty(list)
+#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name)
+#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry)
+#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \
+ p = NULL; hlist_for_each_entry(dentry, i_dentry, alias)
+
+
+#define bio_hw_segments(q, bio) 0
+
+
+#define ll_pagevec_init(pv, cold) do {} while (0)
+#define ll_pagevec_add(pv, pg) (0)
+#define ll_pagevec_lru_add_file(pv) do {} while (0)
+
+
+#ifndef QUOTA_OK
+# define QUOTA_OK 0
+#endif
+#ifndef NO_QUOTA
+# define NO_QUOTA (-EDQUOT)
+#endif
+
+#ifndef SEEK_DATA
+#define SEEK_DATA 3 /* seek to the next data */
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE 4 /* seek to the next hole */
+#endif
+
+#ifndef FMODE_UNSIGNED_OFFSET
+#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+#endif
+
+#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit)
+# define ext2_set_bit __test_and_set_bit_le
+# define ext2_clear_bit __test_and_clear_bit_le
+# define ext2_test_bit test_bit_le
+# define ext2_find_first_zero_bit find_first_zero_bit_le
+# define ext2_find_next_zero_bit find_next_zero_bit_le
+#endif
+
+#ifdef ATTR_TIMES_SET
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+#else
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET)
+#endif
+
+
+
+/*
+ * After 3.1, the kernel's nameidata.intent.open.flags differs from
+ * lustre's lookup_intent.it_flags: the lower bits of lustre's it_flags
+ * equal FMODE_xxx, while the kernel does not translate the lower bits
+ * of nameidata.intent.open.flags to FMODE_xxx.
+ */
+#include <linux/version.h>
+static inline int ll_namei_to_lookup_intent_flag(int flag)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
+ flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag);
+#endif
+ return flag;
+}
+
+#include <linux/fs.h>
+
+# define ll_umode_t umode_t
+
+#include <linux/dcache.h>
+
+# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag)
+
+#endif /* _LINUX_COMPAT25_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
new file mode 100644
index 000000000..a7658a99a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LL_H
+#define _LINUX_LL_H
+
+#ifndef _LL_H
+#error Do not #include this file directly. #include <lustre_lite.h> instead
+#endif
+
+
+#include <asm/statfs.h>
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/proc_fs.h>
+
+#include "../obd_class.h"
+#include "../lustre_net.h"
+#include "../lustre_ha.h"
+
+#include <linux/rbtree.h>
+#include "../../include/linux/lustre_compat25.h"
+#include <linux/pagemap.h>
+
+/* lprocfs.c */
+enum {
+ LPROC_LL_DIRTY_HITS = 0,
+ LPROC_LL_DIRTY_MISSES,
+ LPROC_LL_READ_BYTES,
+ LPROC_LL_WRITE_BYTES,
+ LPROC_LL_BRW_READ,
+ LPROC_LL_BRW_WRITE,
+ LPROC_LL_OSC_READ,
+ LPROC_LL_OSC_WRITE,
+ LPROC_LL_IOCTL,
+ LPROC_LL_OPEN,
+ LPROC_LL_RELEASE,
+ LPROC_LL_MAP,
+ LPROC_LL_LLSEEK,
+ LPROC_LL_FSYNC,
+ LPROC_LL_READDIR,
+ LPROC_LL_SETATTR,
+ LPROC_LL_TRUNC,
+ LPROC_LL_FLOCK,
+ LPROC_LL_GETATTR,
+ LPROC_LL_CREATE,
+ LPROC_LL_LINK,
+ LPROC_LL_UNLINK,
+ LPROC_LL_SYMLINK,
+ LPROC_LL_MKDIR,
+ LPROC_LL_RMDIR,
+ LPROC_LL_MKNOD,
+ LPROC_LL_RENAME,
+ LPROC_LL_STAFS,
+ LPROC_LL_ALLOC_INODE,
+ LPROC_LL_SETXATTR,
+ LPROC_LL_GETXATTR,
+ LPROC_LL_GETXATTR_HITS,
+ LPROC_LL_LISTXATTR,
+ LPROC_LL_REMOVEXATTR,
+ LPROC_LL_INODE_PERM,
+ LPROC_LL_FILE_OPCODES
+};
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
new file mode 100644
index 000000000..d72605864
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_PATCHLESS_COMPAT_H
+#define LUSTRE_PATCHLESS_COMPAT_H
+
+#include <linux/fs.h>
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+
+
+#define ll_delete_from_page_cache(page) delete_from_page_cache(page)
+
+static inline void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+ if (page->mapping != mapping)
+ return;
+
+ if (PagePrivate(page))
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping);
+
+ ClearPageMappedToDisk(page);
+ ll_delete_from_page_cache(page);
+}
+
+#ifdef ATTR_OPEN
+# define ATTR_FROM_OPEN ATTR_OPEN
+#else
+# ifndef ATTR_FROM_OPEN
+# define ATTR_FROM_OPEN 0
+# endif
+#endif /* ATTR_OPEN */
+
+#ifndef ATTR_RAW
+#define ATTR_RAW 0
+#endif
+
+#ifndef ATTR_CTIME_SET
+/*
+ * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_CTIME_SET (1 << 28)
+#endif
+
+#endif /* LUSTRE_PATCHLESS_COMPAT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
new file mode 100644
index 000000000..9cc2849f3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
@@ -0,0 +1,70 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LINUX_LUSTRE_USER_H
+#define _LINUX_LUSTRE_USER_H
+
+# include <linux/quota.h>
+
+/*
+ * asm-x86_64/processor.h on some SLES 9 distros seems to use
+ * kernel-only typedefs. Fortunately, skipping it altogether is OK
+ * (for now).
+ */
+#define __ASM_X86_64_PROCESSOR_H
+
+#include <linux/string.h>
+
+/*
+ * We always need to use the 64-bit version because the structure
+ * is shared across the entire cluster, where 32-bit and 64-bit
+ * machines coexist.
+ */
+#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64)
+typedef struct stat64 lstat_t;
+#define lstat_f lstat64
+#else
+typedef struct stat lstat_t;
+#define lstat_f lstat
+#endif
+
+#define HAVE_LOV_USER_MDS_DATA
+
+#endif /* _LINUX_LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h
new file mode 100644
index 000000000..9cd868357
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/linux/obd.h
@@ -0,0 +1,125 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_OBD_H
+#define __LINUX_OBD_H
+
+#ifndef __OBD_H
+#error Do not #include this file directly. #include <obd.h> instead
+#endif
+
+#include "../obd_support.h"
+
+# include <linux/fs.h>
+# include <linux/list.h>
+# include <linux/sched.h> /* for struct task_struct, for current.h */
+# include <linux/proc_fs.h>
+# include <linux/mount.h>
+#include "../lustre_intent.h"
+
+struct ll_iattr {
+ struct iattr iattr;
+ unsigned int ia_attr_flags;
+};
+
+#define CLIENT_OBD_LIST_LOCK_DEBUG 1
+
+typedef struct {
+ spinlock_t lock;
+
+ unsigned long time;
+ struct task_struct *task;
+ const char *func;
+ int line;
+} client_obd_lock_t;
+
+static inline void __client_obd_list_lock(client_obd_lock_t *lock,
+ const char *func, int line)
+{
+ unsigned long cur = jiffies;
+ while (1) {
+ if (spin_trylock(&lock->lock)) {
+ LASSERT(lock->task == NULL);
+ lock->task = current;
+ lock->func = func;
+ lock->line = line;
+ lock->time = jiffies;
+ break;
+ }
+
+ if (time_before(cur + 5 * HZ, jiffies) &&
+ time_before(lock->time + 5 * HZ, jiffies)) {
+ struct task_struct *task = lock->task;
+
+ if (task == NULL)
+ continue;
+
+ LCONSOLE_WARN("%s:%d: lock %p was acquired by <%s:%d:%s:%d> for %lu seconds.\n",
+ current->comm, current->pid,
+ lock, task->comm, task->pid,
+ lock->func, lock->line,
+ (jiffies - lock->time) / HZ);
+ LCONSOLE_WARN("====== for current process =====\n");
+ dump_stack();
+ LCONSOLE_WARN("====== end =======\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(1000 * HZ);
+ }
+ cpu_relax();
+ }
+}
+
+#define client_obd_list_lock(lock) \
+ __client_obd_list_lock(lock, __func__, __LINE__)
+
+static inline void client_obd_list_unlock(client_obd_lock_t *lock)
+{
+ LASSERT(lock->task != NULL);
+ lock->task = NULL;
+ lock->time = jiffies;
+ spin_unlock(&lock->lock);
+}
+
+
+static inline void client_obd_list_lock_init(client_obd_lock_t *lock)
+{
+ spin_lock_init(&lock->lock);
+}
+
+static inline void client_obd_list_lock_done(client_obd_lock_t *lock)
+{}
+
+#endif /* __LINUX_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
new file mode 100644
index 000000000..d030847e5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h
@@ -0,0 +1,1015 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LPROCFS_SNMP_H
+#define _LPROCFS_SNMP_H
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "lustre/lustre_idl.h"
+
+struct lprocfs_vars {
+ const char *name;
+ struct file_operations *fops;
+ void *data;
+ /**
+ * /proc file mode.
+ */
+ umode_t proc_mode;
+};
+
+struct lprocfs_static_vars {
+ struct lprocfs_vars *module_vars;
+ struct lprocfs_vars *obd_vars;
+};
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+ spinlock_t oh_lock;
+ unsigned long oh_buckets[OBD_HIST_MAX];
+};
+
+enum {
+ BRW_R_PAGES = 0,
+ BRW_W_PAGES,
+ BRW_R_RPC_HIST,
+ BRW_W_RPC_HIST,
+ BRW_R_IO_TIME,
+ BRW_W_IO_TIME,
+ BRW_R_DISCONT_PAGES,
+ BRW_W_DISCONT_PAGES,
+ BRW_R_DISCONT_BLOCKS,
+ BRW_W_DISCONT_BLOCKS,
+ BRW_R_DISK_IOSIZE,
+ BRW_W_DISK_IOSIZE,
+ BRW_R_DIO_FRAGS,
+ BRW_W_DIO_FRAGS,
+ BRW_LAST,
+};
+
+struct brw_stats {
+ struct obd_histogram hist[BRW_LAST];
+};
+
+enum {
+ RENAME_SAMEDIR_SIZE = 0,
+ RENAME_CROSSDIR_SRC_SIZE,
+ RENAME_CROSSDIR_TGT_SIZE,
+ RENAME_LAST,
+};
+
+struct rename_stats {
+ struct obd_histogram hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks
+ * are not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates that the counter takes multi-valued
+ * samples (i.e. the counter can be incremented by more than "1"). When
+ * specified, the counter maintains min, max and sum in addition to a
+ * simple invocation count, which allows averages to be computed.
+ * If not specified, the counter is an increment-by-1 counter and
+ * min, max, sum, etc. are not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
+ * squares (for multi-valued counter samples only). This allows
+ * external computation of standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ */
+
+enum {
+ LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
+ LPROCFS_CNTR_AVGMINMAX = 0x0002,
+ LPROCFS_CNTR_STDDEV = 0x0004,
+
+ /* counter data type */
+ LPROCFS_TYPE_REGS = 0x0100,
+ LPROCFS_TYPE_BYTES = 0x0200,
+ LPROCFS_TYPE_PAGES = 0x0400,
+ LPROCFS_TYPE_CYCLE = 0x0800,
+};
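A sketch of how the configuration bits above are typically combined when counters are registered; lprocfs_alloc_stats() and lprocfs_counter_init() are assumed to be the allocation/init helpers declared elsewhere in this header, and the names and units are illustrative only:

/* Illustrative only: one plain invocation counter and one multi-valued,
 * averaging byte counter. */
static struct lprocfs_stats *example_alloc_stats(void)
{
        struct lprocfs_stats *stats;

        stats = lprocfs_alloc_stats(2, LPROCFS_STATS_FLAG_NONE);
        if (stats == NULL)
                return NULL;

        /* increment-by-1 counter: no min/max/sum maintained */
        lprocfs_counter_init(stats, 0, LPROCFS_TYPE_REGS, "open", "reqs");
        /* multi-valued counter: min/max/sum kept so averages can be read */
        lprocfs_counter_init(stats, 1,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
                             "read_bytes", "bytes");
        return stats;
}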
+
+#define LC_MIN_INIT ((~(__u64)0) >> 1)
+
+struct lprocfs_counter_header {
+ unsigned int lc_config;
+ const char *lc_name; /* must be static */
+ const char *lc_units; /* must be static */
+};
+
+struct lprocfs_counter {
+ __s64 lc_count;
+ __s64 lc_min;
+ __s64 lc_max;
+ __s64 lc_sumsquare;
+ /*
+ * Every counter has lc_array_sum[0]; lc_array_sum[1] exists only for
+ * irq-context counters, i.e. counters of stats allocated with the
+ * LPROCFS_STATS_FLAG_IRQ_SAFE flag.
+ */
+ __s64 lc_array_sum[1];
+};
+#define lc_sum lc_array_sum[0]
+#define lc_sum_irq lc_array_sum[1]
+
+struct lprocfs_percpu {
+#ifndef __GNUC__
+ __s64 pad;
+#endif
+ struct lprocfs_counter lp_cntr[0];
+};
+
+#define LPROCFS_GET_NUM_CPU 0x0001
+#define LPROCFS_GET_SMP_ID 0x0002
+
+enum lprocfs_stats_flags {
+ LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */
+ LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
+ * area and need locking */
+ LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */
+};
+
+enum lprocfs_fields_flags {
+ LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001,
+ LPROCFS_FIELDS_FLAGS_SUM = 0x0002,
+ LPROCFS_FIELDS_FLAGS_MIN = 0x0003,
+ LPROCFS_FIELDS_FLAGS_MAX = 0x0004,
+ LPROCFS_FIELDS_FLAGS_AVG = 0x0005,
+ LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006,
+ LPROCFS_FIELDS_FLAGS_COUNT = 0x0007,
+};
+
+struct lprocfs_stats {
+ /* # of counters */
+ unsigned short ls_num;
+ /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */
+ unsigned short ls_biggest_alloc_num;
+ enum lprocfs_stats_flags ls_flags;
+ /* Lock used when there are no percpu stats areas; For percpu stats,
+ * it is used to protect ls_biggest_alloc_num change */
+ spinlock_t ls_lock;
+
+ /* has ls_num of counter headers */
+ struct lprocfs_counter_header *ls_cnt_header;
+ struct lprocfs_percpu *ls_percpu[0];
+};
+
+#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC)
+
+/* Pack all opcodes down into a single monotonically increasing index */
+static inline int opcode_offset(__u32 opc) {
+ if (opc < OST_LAST_OPC) {
+ /* OST opcode */
+ return (opc - OST_FIRST_OPC);
+ } else if (opc < MDS_LAST_OPC) {
+ /* MDS opcode */
+ return (opc - MDS_FIRST_OPC +
+ OPC_RANGE(OST));
+ } else if (opc < LDLM_LAST_OPC) {
+ /* LDLM Opcode */
+ return (opc - LDLM_FIRST_OPC +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < MGS_LAST_OPC) {
+ /* MGS Opcode */
+ return (opc - MGS_FIRST_OPC +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < OBD_LAST_OPC) {
+ /* OBD Ping */
+ return (opc - OBD_FIRST_OPC +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < LLOG_LAST_OPC) {
+ /* LLOG Opcode */
+ return (opc - LLOG_FIRST_OPC +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < QUOTA_LAST_OPC) {
+ /* LQUOTA Opcode */
+ return (opc - QUOTA_FIRST_OPC +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < SEQ_LAST_OPC) {
+ /* SEQ opcode */
+ return (opc - SEQ_FIRST_OPC +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < SEC_LAST_OPC) {
+ /* SEC opcode */
+ return (opc - SEC_FIRST_OPC +
+ OPC_RANGE(SEQ) +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < FLD_LAST_OPC) {
+ /* FLD opcode */
+ return (opc - FLD_FIRST_OPC +
+ OPC_RANGE(SEC) +
+ OPC_RANGE(SEQ) +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else if (opc < UPDATE_LAST_OPC) {
+ /* update opcode */
+ return (opc - UPDATE_FIRST_OPC +
+ OPC_RANGE(FLD) +
+ OPC_RANGE(SEC) +
+ OPC_RANGE(SEQ) +
+ OPC_RANGE(QUOTA) +
+ OPC_RANGE(LLOG) +
+ OPC_RANGE(OBD) +
+ OPC_RANGE(MGS) +
+ OPC_RANGE(LDLM) +
+ OPC_RANGE(MDS) +
+ OPC_RANGE(OST));
+ } else {
+ /* Unknown Opcode */
+ return -1;
+ }
+}
+
+
+#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \
+ OPC_RANGE(MDS) + \
+ OPC_RANGE(LDLM) + \
+ OPC_RANGE(MGS) + \
+ OPC_RANGE(OBD) + \
+ OPC_RANGE(LLOG) + \
+ OPC_RANGE(QUOTA) + \
+ OPC_RANGE(SEQ) + \
+ OPC_RANGE(SEC) + \
+ OPC_RANGE(FLD) + \
+ OPC_RANGE(UPDATE))
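The point of opcode_offset() above is that per-opcode statistics can live in one dense array of LUSTRE_MAX_OPCODES slots; a minimal sketch of that use, with the hit-counter array purely illustrative:

/* Illustrative only: bump a per-opcode hit counter kept in a plain
 * array of LUSTRE_MAX_OPCODES entries. */
static void example_count_opcode(unsigned long *opcode_hits, __u32 opc)
{
        int offset = opcode_offset(opc);

        if (offset >= 0 && offset < LUSTRE_MAX_OPCODES)
                opcode_hits[offset]++;
}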
+
+#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \
+ OPC_RANGE(EXTRA))
+
+enum {
+ PTLRPC_REQWAIT_CNTR = 0,
+ PTLRPC_REQQDEPTH_CNTR,
+ PTLRPC_REQACTIVE_CNTR,
+ PTLRPC_TIMEOUT,
+ PTLRPC_REQBUF_AVAIL_CNTR,
+ PTLRPC_LAST_CNTR
+};
+
+#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
+
+enum {
+ LDLM_GLIMPSE_ENQUEUE = 0,
+ LDLM_PLAIN_ENQUEUE,
+ LDLM_EXTENT_ENQUEUE,
+ LDLM_FLOCK_ENQUEUE,
+ LDLM_IBITS_ENQUEUE,
+ MDS_REINT_SETATTR,
+ MDS_REINT_CREATE,
+ MDS_REINT_LINK,
+ MDS_REINT_UNLINK,
+ MDS_REINT_RENAME,
+ MDS_REINT_OPEN,
+ MDS_REINT_SETXATTR,
+ BRW_READ_BYTES,
+ BRW_WRITE_BYTES,
+ EXTRA_LAST_OPC
+};
+
+#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
+/* class_obd.c */
+extern struct proc_dir_entry *proc_lustre_root;
+
+struct obd_device;
+struct obd_histogram;
+
+/* Days / hours / mins / seconds format */
+struct dhms {
+ int d, h, m, s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+ ts->d = secs / 86400;
+ secs = secs % 86400;
+ ts->h = secs / 3600;
+ secs = secs % 3600;
+ ts->m = secs / 60;
+ ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
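A small usage sketch of the dhms helpers above; the seq_file destination is illustrative only:

/* Illustrative only: render an age in seconds as e.g. "3d4h05m09s". */
static void example_print_age(struct seq_file *m, time_t age)
{
        struct dhms ts;

        s2dhms(&ts, age);
        seq_printf(m, DHMS_FMT "\n", DHMS_VARS(&ts));
}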
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN 20
+#define JOBSTATS_DISABLE "disable"
+#define JOBSTATS_PROCNAME_UID "procname_uid"
+#define JOBSTATS_NODELOCAL "nodelocal"
+
+extern int lprocfs_write_frac_helper(const char __user *buffer,
+ unsigned long count, int *val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+ long val, int mult);
+#if defined (CONFIG_PROC_FS)
+
+extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+ unsigned int cpuid);
+/*
+ * \return value
+ * < 0 : on error (only possible when opc is LPROCFS_GET_SMP_ID)
+ */
+static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
+ unsigned long *flags)
+{
+ int rc = 0;
+
+ switch (opc) {
+ default:
+ LBUG();
+
+ case LPROCFS_GET_SMP_ID:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ spin_lock_irqsave(&stats->ls_lock, *flags);
+ else
+ spin_lock(&stats->ls_lock);
+ return 0;
+ } else {
+ unsigned int cpuid = get_cpu();
+
+ if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
+ rc = lprocfs_stats_alloc_one(stats, cpuid);
+ if (rc < 0) {
+ put_cpu();
+ return rc;
+ }
+ }
+ return cpuid;
+ }
+
+ case LPROCFS_GET_NUM_CPU:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ spin_lock_irqsave(&stats->ls_lock, *flags);
+ else
+ spin_lock(&stats->ls_lock);
+ return 1;
+ } else {
+ return stats->ls_biggest_alloc_num;
+ }
+ }
+}
+
+static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
+ unsigned long *flags)
+{
+ switch (opc) {
+ default:
+ LBUG();
+
+ case LPROCFS_GET_SMP_ID:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+ spin_unlock_irqrestore(&stats->ls_lock,
+ *flags);
+ } else {
+ spin_unlock(&stats->ls_lock);
+ }
+ } else {
+ put_cpu();
+ }
+ return;
+
+ case LPROCFS_GET_NUM_CPU:
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+ spin_unlock_irqrestore(&stats->ls_lock,
+ *flags);
+ } else {
+ spin_unlock(&stats->ls_lock);
+ }
+ }
+ return;
+ }
+}
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+ unsigned int percpusize;
+
+ percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+ /* irq safe stats need lc_array_sum[1] */
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ percpusize += stats->ls_num * sizeof(__s64);
+
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+ percpusize = L1_CACHE_ALIGN(percpusize);
+
+ return percpusize;
+}
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+ int index)
+{
+ struct lprocfs_counter *cntr;
+
+ cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ cntr = (void *)cntr + index * sizeof(__s64);
+
+ return cntr;
+}
+
+/* Two optimized LPROCFS counter increment functions are provided:
+ * lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ * lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
+ */
+
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+ long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+ long amount);
+
+#define lprocfs_counter_incr(stats, idx) \
+ lprocfs_counter_add(stats, idx, 1)
+#define lprocfs_counter_decr(stats, idx) \
+ lprocfs_counter_sub(stats, idx, 1)
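+
+/*
+ * Illustrative sketch (not part of the original header); the index names
+ * below are hypothetical:
+ *
+ *	lprocfs_counter_incr(stats, EXAMPLE_OPEN_CNTR);
+ *	lprocfs_counter_add(stats, EXAMPLE_READ_BYTES_CNTR, count);
+ */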
+
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+ struct lprocfs_counter_header *header,
+ enum lprocfs_stats_flags flags,
+ enum lprocfs_fields_flags field);
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+ int idx,
+ enum lprocfs_fields_flags field)
+{
+ int i;
+ unsigned int num_cpu;
+ unsigned long flags = 0;
+ __u64 ret = 0;
+
+ LASSERT(stats != NULL);
+
+ num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ for (i = 0; i < num_cpu; i++) {
+ if (stats->ls_percpu[i] == NULL)
+ continue;
+ ret += lprocfs_read_helper(
+ lprocfs_stats_counter_get(stats, i, idx),
+ &stats->ls_cnt_header[idx], stats->ls_flags,
+ field);
+ }
+ lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ return ret;
+}
+
+extern struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags);
+extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
+extern void lprocfs_free_stats(struct lprocfs_stats **stats);
+extern void lprocfs_init_ops_stats(int num_private_stats,
+ struct lprocfs_stats *stats);
+extern void lprocfs_init_mps_stats(int num_private_stats,
+ struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
+extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+ unsigned int num_private_stats);
+extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
+ unsigned int num_private_stats);
+extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+ unsigned conf, const char *name,
+ const char *units);
+extern void lprocfs_free_obd_stats(struct obd_device *obddev);
+extern void lprocfs_free_md_stats(struct obd_device *obddev);
+struct obd_export;
+struct nid_stat;
+extern int lprocfs_add_clear_entry(struct obd_device *obd,
+ struct proc_dir_entry *entry);
+extern int lprocfs_exp_setup(struct obd_export *exp,
+ lnet_nid_t *peer_nid, int *newnid);
+extern int lprocfs_exp_cleanup(struct obd_export *exp);
+extern struct proc_dir_entry *lprocfs_add_simple(struct proc_dir_entry *root,
+ char *name,
+ void *data,
+ struct file_operations *fops);
+extern struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+ const char *format, ...);
+extern void lprocfs_free_per_client_stats(struct obd_device *obd);
+extern int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data);
+
+extern int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+ struct lprocfs_stats *stats);
+
+/* lprocfs_status.c */
+extern int lprocfs_add_vars(struct proc_dir_entry *root,
+ struct lprocfs_vars *var,
+ void *data);
+
+extern struct proc_dir_entry *lprocfs_register(const char *name,
+ struct proc_dir_entry *parent,
+ struct lprocfs_vars *list,
+ void *data);
+
+extern void lprocfs_remove(struct proc_dir_entry **root);
+extern void lprocfs_remove_proc_entry(const char *name,
+ struct proc_dir_entry *parent);
+
+extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
+extern int lprocfs_obd_cleanup(struct obd_device *obd);
+
+extern int lprocfs_seq_create(struct proc_dir_entry *parent, const char *name,
+ umode_t mode,
+ const struct file_operations *seq_fops,
+ void *data);
+extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+ umode_t mode,
+ const struct file_operations *seq_fops,
+ void *data);
+
+/* Generic callbacks */
+
+extern int lprocfs_rd_u64(struct seq_file *m, void *data);
+extern int lprocfs_rd_atomic(struct seq_file *m, void *data);
+extern int lprocfs_wr_atomic(struct file *file, const char __user *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_rd_uint(struct seq_file *m, void *data);
+extern int lprocfs_wr_uint(struct file *file, const char __user *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_rd_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_name(struct seq_file *m, void *data);
+extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_import(struct seq_file *m, void *data);
+extern int lprocfs_rd_state(struct seq_file *m, void *data);
+extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data);
+extern int lprocfs_rd_num_exports(struct seq_file *m, void *data);
+extern int lprocfs_rd_numrefs(struct seq_file *m, void *data);
+
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(struct seq_file *m,
+ struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(struct seq_file *m, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char __user *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_wr_evict_client(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off);
+extern int lprocfs_wr_ping(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off);
+extern int lprocfs_wr_import(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off);
+extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n);
+extern int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off);
+
+/* Statfs helpers */
+extern int lprocfs_rd_blksize(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data);
+extern int lprocfs_rd_filestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_filesfree(struct seq_file *m, void *data);
+
+extern int lprocfs_write_helper(const char __user *buffer, unsigned long count,
+ int *val);
+extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult);
+extern int lprocfs_write_u64_helper(const char __user *buffer,
+ unsigned long count, __u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer,
+ unsigned long count,
+ __u64 *val, int mult);
+extern char *lprocfs_find_named_value(const char *buffer, const char *name,
+ size_t *count);
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_clear(struct obd_histogram *oh);
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
+
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+ struct lprocfs_counter *cnt);
+
+extern int lprocfs_single_release(struct inode *, struct file *);
+extern int lprocfs_seq_release(struct inode *, struct file *);
+
+/* You must use these macros when you want to refer to
+ * the import in a client obd_device for an lprocfs entry
+ * (a usage sketch follows the macro definitions). */
+#define LPROCFS_CLIMP_CHECK(obd) do { \
+ typecheck(struct obd_device *, obd); \
+ down_read(&(obd)->u.cli.cl_sem); \
+ if ((obd)->u.cli.cl_import == NULL) { \
+ up_read(&(obd)->u.cli.cl_sem); \
+ return -ENODEV; \
+ } \
+} while (0)
+#define LPROCFS_CLIMP_EXIT(obd) \
+ up_read(&(obd)->u.cli.cl_sem)
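+
+/*
+ * Illustrative sketch (not part of the original header): guarding access to
+ * the client import from a hypothetical seq_show handler.
+ *
+ *	static int example_import_seq_show(struct seq_file *m, void *data)
+ *	{
+ *		struct obd_device *obd = m->private;
+ *
+ *		LPROCFS_CLIMP_CHECK(obd);
+ *		seq_printf(m, "import: %p\n", obd->u.cli.cl_import);
+ *		LPROCFS_CLIMP_EXIT(obd);
+ *		return 0;
+ *	}
+ */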
+
+
+/* Write the name##_seq_show function and call LPROC_SEQ_FOPS_RO for a
+ read-only proc entry; otherwise also define a name##_seq_write function
+ for a read-write proc entry and call LPROC_SEQ_FOPS instead. Finally,
+ call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data);
+ a sketch follows the macro definitions below. */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write) \
+static int name##_single_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, name##_seq_show, PDE_DATA(inode)); \
+} \
+static struct file_operations name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = name##_single_open, \
+ .read = seq_read, \
+ .write = custom_seq_write, \
+ .llseek = seq_lseek, \
+ .release = lprocfs_single_release, \
+}
+
+#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write)
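+
+/*
+ * Illustrative sketch (not part of the original header): a read-only entry
+ * built with the macros above; "example_blocksize" is a placeholder name.
+ *
+ *	static int example_blocksize_seq_show(struct seq_file *m, void *data)
+ *	{
+ *		return lprocfs_rd_blksize(m, m->private);
+ *	}
+ *	LPROC_SEQ_FOPS_RO(example_blocksize);
+ *
+ *	rc = lprocfs_obd_seq_create(obd, "blocksize", 0444,
+ *				    &example_blocksize_fops, obd);
+ */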
+
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \
+ static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+ { \
+ return lprocfs_rd_##type(m, m->private); \
+ } \
+ LPROC_SEQ_FOPS_RO(name##_##type)
+
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \
+ static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+ { \
+ return lprocfs_rd_##type(m, m->private); \
+ } \
+ static ssize_t name##_##type##_seq_write(struct file *file, \
+ const char __user *buffer, size_t count, \
+ loff_t *off) \
+ { \
+ struct seq_file *seq = file->private_data; \
+ return lprocfs_wr_##type(file, buffer, \
+ count, seq->private); \
+ } \
+ LPROC_SEQ_FOPS(name##_##type)
+
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \
+ static ssize_t name##_##type##_write(struct file *file, \
+ const char __user *buffer, size_t count, \
+ loff_t *off) \
+ { \
+ return lprocfs_wr_##type(file, buffer, count, off); \
+ } \
+ static int name##_##type##_open(struct inode *inode, struct file *file) \
+ { \
+ return single_open(file, NULL, PDE_DATA(inode)); \
+ } \
+ static struct file_operations name##_##type##_fops = { \
+ .open = name##_##type##_open, \
+ .write = name##_##type##_write, \
+ .release = lprocfs_single_release, \
+ }
+
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
+/* lproc_status.c */
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data);
+int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer,
+ size_t count, loff_t *off);
+
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start,
+ loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count,
+ int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off,
+ int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data);
+#else
+/* CONFIG_PROC_FS is not defined */
+
+#define proc_lustre_root NULL
+
+static inline void lprocfs_counter_add(struct lprocfs_stats *stats,
+ int index, long amount)
+{ return; }
+static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
+ int index)
+{ return; }
+static inline void lprocfs_counter_sub(struct lprocfs_stats *stats,
+ int index, long amount)
+{ return; }
+static inline void lprocfs_counter_decr(struct lprocfs_stats *stats,
+ int index)
+{ return; }
+static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
+ int index, unsigned conf,
+ const char *name, const char *units)
+{ return; }
+
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+ enum lprocfs_fields_flags field)
+{ return 0; }
+
+/* NB: we return !NULL to satisfy error checker */
+static inline struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags)
+{ return (struct lprocfs_stats *)1; }
+static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
+{ return; }
+static inline int lprocfs_register_stats(struct proc_dir_entry *root,
+ const char *name,
+ struct lprocfs_stats *stats)
+{ return 0; }
+static inline void lprocfs_init_ops_stats(int num_private_stats,
+ struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_mps_stats(int num_private_stats,
+ struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ return; }
+static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+ unsigned int num_private_stats)
+{ return 0; }
+static inline int lprocfs_alloc_md_stats(struct obd_device *obddev,
+ unsigned int num_private_stats)
+{ return 0; }
+static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
+{ return; }
+static inline void lprocfs_free_md_stats(struct obd_device *obddev)
+{ return; }
+
+struct obd_export;
+static inline int lprocfs_add_clear_entry(struct obd_export *exp)
+{ return 0; }
+static inline int lprocfs_exp_setup(struct obd_export *exp,
+ lnet_nid_t *peer_nid,
+ int *newnid)
+{ return 0; }
+static inline int lprocfs_exp_cleanup(struct obd_export *exp)
+{ return 0; }
+static inline struct proc_dir_entry *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+ void *data, struct file_operations *fops)
+{ return NULL; }
+static inline struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+ const char *format, ...)
+{return NULL; }
+static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{return count;}
+static inline
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{ return 0; }
+
+static inline struct proc_dir_entry *
+lprocfs_register(const char *name, struct proc_dir_entry *parent,
+ struct lprocfs_vars *list, void *data)
+{ return NULL; }
+static inline int lprocfs_add_vars(struct proc_dir_entry *root,
+ struct lprocfs_vars *var,
+ void *data)
+{ return 0; }
+static inline void lprocfs_remove(struct proc_dir_entry **root)
+{ return; }
+static inline void lprocfs_remove_proc_entry(const char *name,
+ struct proc_dir_entry *parent)
+{ return; }
+static inline int lprocfs_obd_setup(struct obd_device *dev,
+ struct lprocfs_vars *list)
+{ return 0; }
+static inline int lprocfs_obd_cleanup(struct obd_device *dev)
+{ return 0; }
+static inline int lprocfs_rd_u64(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_name(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_import(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{ return 0; }
+static inline int lprocfs_rd_state(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(struct seq_file *m,
+ struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file,
+ const char __user *buffer,
+ unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_evict_client(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_ping(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_import(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_pinger_recov(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{ return 0; }
+
+/* Statfs helpers */
+static inline
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{ return; }
+static inline
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{ return 0; }
+static inline
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+ struct lprocfs_counter *cnt)
+{ return; }
+static inline
+__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+ enum lprocfs_fields_flags field)
+{ return (__u64)0; }
+
+#define LPROC_SEQ_FOPS_RO(name)
+#define LPROC_SEQ_FOPS(name)
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* CONFIG_PROC_FS */
+
+#endif /* LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
new file mode 100644
index 000000000..c8cc48f00
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -0,0 +1,1340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#include <stdarg.h>
+#include "../../include/linux/libcfs/libcfs.h"
+#include "lustre/lustre_idl.h"
+#include "lu_ref.h"
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ * A server-side object is split into layers, one per device in the
+ * corresponding device stack. An individual layer is represented by struct
+ * lu_object, and the compound layered object by struct lu_object_header. Most
+ * interface functions take lu_object as an argument and operate on the
+ * whole compound object. This decision was made due to the following
+ * reasons:
+ *
+ * - it's envisaged that lu_object will be used much more often than
+ * lu_object_header;
+ *
+ * - we want lower (non-top) layers to be able to initiate operations
+ * on the whole object.
+ *
+ * Generic code supports layering more complex than simple stacking, e.g.,
+ * it is possible that at some layer object "spawns" multiple sub-objects
+ * on the lower layer.
+ *
+ * -# fid-based identification.
+ *
+ * Compound object is uniquely identified by its fid. Objects are indexed
+ * by their fids (hash table is used for index).
+ *
+ * -# caching and life-cycle management.
+ *
+ * Object's life-time is controlled by reference counting. When reference
+ * count drops to 0, object is returned to cache. Cached objects still
+ * retain their identity (i.e., fid), and can be recovered from cache.
+ *
+ * Objects are kept in the global LRU list, and lu_site_purge() function
+ * can be used to reclaim given number of unused objects from the tail of
+ * the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ * Generic code tries to replace recursion through layers by iterations
+ * where possible. Additionally, to reduce stack consumption, data are,
+ * when practically possible, allocated through the lu_context_key
+ * interface rather than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+ /**
+ * Allocate object for the given device (without lower-layer
+ * parts). This is called by lu_object_operations::loo_object_init()
+ * from the parent layer, and should set up at least lu_object::lo_dev
+ * and lu_object::lo_ops fields of resulting lu_object.
+ *
+ * Object creation protocol.
+ *
+ * Due to design goal of avoiding recursion, object creation (see
+ * lu_object_alloc()) is somewhat involved:
+ *
+ * - first, lu_device_operations::ldo_object_alloc() method of the
+ * top-level device in the stack is called. It should allocate top
+ * level object (including lu_object_header), but without any
+ * lower-layer sub-object(s).
+ *
+ * - then lu_object_alloc() sets fid in the header of newly created
+ * object.
+ *
+ * - then lu_object_operations::loo_object_init() is called. It has
+ * to allocate lower-layer object(s). To do this,
+ * lu_object_operations::loo_object_init() calls ldo_object_alloc()
+ * of the lower-layer device(s).
+ *
+ * - for all new objects allocated by
+ * lu_object_operations::loo_object_init() (and inserted into object
+ * stack), lu_object_operations::loo_object_init() is called again
+ * repeatedly, until no new objects are created.
+ *
+ * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+ * result->lo_ops != NULL);
+ */
+ struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+ const struct lu_object_header *h,
+ struct lu_device *d);
+ /**
+ * process config specific for device.
+ */
+ int (*ldo_process_config)(const struct lu_env *env,
+ struct lu_device *, struct lustre_cfg *);
+ int (*ldo_recovery_complete)(const struct lu_env *,
+ struct lu_device *);
+
+ /**
+ * Initialize local objects for the device. This method is called after the
+ * layer has been initialized (after the LCFG_SETUP stage) and before it
+ * starts serving user requests.
+ */
+
+ int (*ldo_prepare)(const struct lu_env *,
+ struct lu_device *parent,
+ struct lu_device *dev);
+
+};
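+
+/*
+ * Illustrative sketch (not part of the original header) of the allocation
+ * protocol described above, as a hypothetical layer's loo_object_init()
+ * might implement it; example_lower_device() is a placeholder for however
+ * the layer finds its lower device.
+ *
+ *	static int example_object_init(const struct lu_env *env,
+ *				       struct lu_object *o,
+ *				       const struct lu_object_conf *conf)
+ *	{
+ *		struct lu_device *under = example_lower_device(o->lo_dev);
+ *		struct lu_object *below;
+ *
+ *		below = under->ld_ops->ldo_object_alloc(env, o->lo_header,
+ *							under);
+ *		if (below == NULL)
+ *			return -ENOMEM;
+ *		lu_object_add(o, below);
+ *		return 0;
+ *	}
+ */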
+
+/**
+ * For lu_object_conf flags
+ */
+typedef enum {
+ /* This is a new object to be allocated, or the file
+ * corresponding to the object does not exist. */
+ LOC_F_NEW = 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing particulars of object being created. On
+ * server this is not used, as server objects are fully identified by fid.
+ * On the client the configuration contains a struct lustre_md.
+ */
+struct lu_object_conf {
+ /**
+ * Some hints for obj find and alloc.
+ */
+ loc_flags_t loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method.
+ *
+ * Printer function is needed to provide some flexibility in (semi-)debugging
+ * output: possible implementations: printk, CDEBUG, sysfs/seq_file
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+ void *cookie, const char *format, ...)
+ __printf(3, 4);
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+
+ /**
+ * Allocate lower-layer parts of the object by calling
+ * lu_device_operations::ldo_object_alloc() of the corresponding
+ * underlying device.
+ *
+ * This method is called once for each object inserted into object
+ * stack. It is the responsibility of this method to insert the lower-layer
+ * object(s) it creates into the appropriate places of the object stack.
+ */
+ int (*loo_object_init)(const struct lu_env *env,
+ struct lu_object *o,
+ const struct lu_object_conf *conf);
+ /**
+ * Called (in top-to-bottom order) during object allocation after all
+ * layers were allocated and initialized. Can be used to perform
+ * initialization depending on lower layers.
+ */
+ int (*loo_object_start)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Called before lu_object_operations::loo_object_free() to signal
+ * that object is being destroyed. Dual to
+ * lu_object_operations::loo_object_init().
+ */
+ void (*loo_object_delete)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Dual to lu_device_operations::ldo_object_alloc(). Called when
+ * object is removed from memory.
+ */
+ void (*loo_object_free)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Called when last active reference to the object is released (and
+ * object returns to the cache). This method is optional.
+ */
+ void (*loo_object_release)(const struct lu_env *env,
+ struct lu_object *o);
+ /**
+ * Optional debugging helper. Print given object.
+ */
+ int (*loo_object_print)(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o);
+ /**
+ * Optional debugging method. Returns true iff method is internally
+ * consistent.
+ */
+ int (*loo_object_invariant)(const struct lu_object *o);
+};
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+ /**
+ * reference count. This is incremented, in particular, on each object
+ * created at this layer.
+ *
+ * \todo XXX which means that atomic_t is probably too small.
+ */
+ atomic_t ld_ref;
+ /**
+ * Pointer to device type. Never modified once set.
+ */
+ struct lu_device_type *ld_type;
+ /**
+ * Operation vector for this device.
+ */
+ const struct lu_device_operations *ld_ops;
+ /**
+ * Stack this device belongs to.
+ */
+ struct lu_site *ld_site;
+ struct proc_dir_entry *ld_proc_entry;
+
+ /** \todo XXX: temporary back pointer into obd. */
+ struct obd_device *ld_obd;
+ /**
+ * A list of references to this object, for debugging.
+ */
+ struct lu_ref ld_reference;
+ /**
+ * Link the device to the site.
+ **/
+ struct list_head ld_linkage;
+};
+
+struct lu_device_type_operations;
+
+/**
+ * Tag bits for device type. They are used to distinguish certain groups of
+ * device types.
+ */
+enum lu_device_tag {
+ /** this is meta-data device */
+ LU_DEVICE_MD = (1 << 0),
+ /** this is data device */
+ LU_DEVICE_DT = (1 << 1),
+ /** data device in the client stack */
+ LU_DEVICE_CL = (1 << 2)
+};
+
+/**
+ * Type of device.
+ */
+struct lu_device_type {
+ /**
+ * Tag bits. Taken from enum lu_device_tag. Never modified once set.
+ */
+ __u32 ldt_tags;
+ /**
+ * Name of this class. Unique system-wide. Never modified once set.
+ */
+ char *ldt_name;
+ /**
+ * Operations for this type.
+ */
+ const struct lu_device_type_operations *ldt_ops;
+ /**
+ * \todo XXX: temporary pointer to associated obd_type.
+ */
+ struct obd_type *ldt_obd_type;
+ /**
+ * \todo XXX: temporary: context tags used by obd_*() calls.
+ */
+ __u32 ldt_ctx_tags;
+ /**
+ * Number of existing device type instances.
+ */
+ unsigned ldt_device_nr;
+ /**
+ * Linkage into a global list of all device types.
+ *
+ * \see lu_device_types.
+ */
+ struct list_head ldt_linkage;
+};
+
+/**
+ * Operations on a device type.
+ */
+struct lu_device_type_operations {
+ /**
+ * Allocate new device.
+ */
+ struct lu_device *(*ldto_device_alloc)(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *lcfg);
+ /**
+ * Free device. Dual to
+ * lu_device_type_operations::ldto_device_alloc(). Returns pointer to
+ * the next device in the stack.
+ */
+ struct lu_device *(*ldto_device_free)(const struct lu_env *,
+ struct lu_device *);
+
+ /**
+ * Initialize the devices after allocation
+ */
+ int (*ldto_device_init)(const struct lu_env *env,
+ struct lu_device *, const char *,
+ struct lu_device *);
+ /**
+ * Finalize device. Dual to
+ * lu_device_type_operations::ldto_device_init(). Returns pointer to
+ * the next device in the stack.
+ */
+ struct lu_device *(*ldto_device_fini)(const struct lu_env *env,
+ struct lu_device *);
+ /**
+ * Initialize device type. This is called on module load.
+ */
+ int (*ldto_init)(struct lu_device_type *t);
+ /**
+ * Finalize device type. Dual to
+ * lu_device_type_operations::ldto_init(). Called on module unload.
+ */
+ void (*ldto_fini)(struct lu_device_type *t);
+ /**
+ * Called when the first device is created.
+ */
+ void (*ldto_start)(struct lu_device_type *t);
+ /**
+ * Called when number of devices drops to 0.
+ */
+ void (*ldto_stop)(struct lu_device_type *t);
+};
+
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+ return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
+
+/**
+ * Common object attributes.
+ */
+struct lu_attr {
+ /** size in bytes */
+ __u64 la_size;
+ /** modification time in seconds since Epoch */
+ s64 la_mtime;
+ /** access time in seconds since Epoch */
+ s64 la_atime;
+ /** change time in seconds since Epoch */
+ s64 la_ctime;
+ /** 512-byte blocks allocated to object */
+ __u64 la_blocks;
+ /** permission bits and file type */
+ __u32 la_mode;
+ /** owner id */
+ __u32 la_uid;
+ /** group id */
+ __u32 la_gid;
+ /** object flags */
+ __u32 la_flags;
+ /** number of persistent references to this object */
+ __u32 la_nlink;
+ /** blk bits of the object*/
+ __u32 la_blkbits;
+ /** blk size of the object*/
+ __u32 la_blksize;
+ /** real device */
+ __u32 la_rdev;
+ /**
+ * valid bits
+ *
+ * \see enum la_valid
+ */
+ __u64 la_valid;
+};
+
+/** Bit-mask of valid attributes */
+enum la_valid {
+ LA_ATIME = 1 << 0,
+ LA_MTIME = 1 << 1,
+ LA_CTIME = 1 << 2,
+ LA_SIZE = 1 << 3,
+ LA_MODE = 1 << 4,
+ LA_UID = 1 << 5,
+ LA_GID = 1 << 6,
+ LA_BLOCKS = 1 << 7,
+ LA_TYPE = 1 << 8,
+ LA_FLAGS = 1 << 9,
+ LA_NLINK = 1 << 10,
+ LA_RDEV = 1 << 11,
+ LA_BLKSIZE = 1 << 12,
+ LA_KILL_SUID = 1 << 13,
+ LA_KILL_SGID = 1 << 14,
+};
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+ /**
+ * Header for this object.
+ */
+ struct lu_object_header *lo_header;
+ /**
+ * Device for this layer.
+ */
+ struct lu_device *lo_dev;
+ /**
+ * Operations for this object.
+ */
+ const struct lu_object_operations *lo_ops;
+ /**
+ * Linkage into list of all layers.
+ */
+ struct list_head lo_linkage;
+ /**
+ * Link to the device, for debugging.
+ */
+ struct lu_ref_link lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+ /**
+ * Don't keep this object in cache. Object will be destroyed as soon
+ * as last reference to it is released. This flag cannot be cleared
+ * once set.
+ */
+ LU_OBJECT_HEARD_BANSHEE = 0,
+ /**
+ * Mark this object has already been taken out of cache.
+ */
+ LU_OBJECT_UNHASHED = 1
+};
+
+enum lu_object_header_attr {
+ LOHA_EXISTS = 1 << 0,
+ LOHA_REMOTE = 1 << 1,
+ /**
+ * UNIX file type is stored in S_IFMT bits.
+ */
+ LOHA_FT_START = 001 << 12, /**< S_IFIFO */
+ LOHA_FT_END = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * Compound object with given fid is unique with given lu_site.
+ *
+ * Note that the object does *not* necessarily correspond to a real object in
+ * persistent storage: the object is an anchor for locking and method calling,
+ * so it is also created for things like a not-yet-existing child of a mkdir
+ * or create call. lu_object_exists() can be used to check whether the object
+ * is backed by a persistent storage entity.
+ */
+struct lu_object_header {
+ /**
+ * Fid, uniquely identifying this object.
+ */
+ struct lu_fid loh_fid;
+ /**
+ * Object flags from enum lu_object_header_flags. Set and checked
+ * atomically.
+ */
+ unsigned long loh_flags;
+ /**
+ * Object reference count. Protected by lu_site::ls_guard.
+ */
+ atomic_t loh_ref;
+ /**
+ * Common object attributes, cached for efficiency. From enum
+ * lu_object_header_attr.
+ */
+ __u32 loh_attr;
+ /**
+ * Linkage into per-site hash table. Protected by lu_site::ls_guard.
+ */
+ struct hlist_node loh_hash;
+ /**
+ * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+ */
+ struct list_head loh_lru;
+ /**
+ * Linkage into list of layers. Never modified once set (except at the
+ * very end, during object destruction). No locking is necessary.
+ */
+ struct list_head loh_layers;
+ /**
+ * A list of references to this object, for debugging.
+ */
+ struct lu_ref loh_reference;
+};
+
+struct fld;
+
+struct lu_site_bkt_data {
+ /**
+ * number of busy object on this bucket
+ */
+ long lsb_busy;
+ /**
+ * LRU list, updated on each access to object. Protected by
+ * bucket lock of lu_site::ls_obj_hash.
+ *
+ * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are
+ * moved to the lu_site::ls_lru.prev (this is due to the non-existence
+ * of list_for_each_entry_safe_reverse()).
+ */
+ struct list_head lsb_lru;
+ /**
+ * Wait-queue signaled when an object in this site is ultimately
+ * destroyed (lu_object_free()). It is used by lu_object_find() to
+ * wait before re-trying when object in the process of destruction is
+ * found in the hash table.
+ *
+ * \see htable_lookup().
+ */
+ wait_queue_head_t lsb_marche_funebre;
+};
+
+enum {
+ LU_SS_CREATED = 0,
+ LU_SS_CACHE_HIT,
+ LU_SS_CACHE_MISS,
+ LU_SS_CACHE_RACE,
+ LU_SS_CACHE_DEATH_RACE,
+ LU_SS_LRU_PURGED,
+ LU_SS_LAST_STAT
+};
+
+/**
+ * lu_site is a "compartment" within which objects are unique, and LRU
+ * discipline is maintained.
+ *
+ * lu_site exists so that multiple layered stacks can co-exist in the same
+ * address space.
+ *
+ * lu_site has the same relation to lu_device as lu_object_header to
+ * lu_object.
+ */
+struct lu_site {
+ /**
+ * objects hash table
+ */
+ struct cfs_hash *ls_obj_hash;
+ /**
+ * index of bucket on hash table while purging
+ */
+ int ls_purge_start;
+ /**
+ * Top-level device for this stack.
+ */
+ struct lu_device *ls_top_dev;
+ /**
+ * Bottom-level device for this stack
+ */
+ struct lu_device *ls_bottom_dev;
+ /**
+ * Linkage into global list of sites.
+ */
+ struct list_head ls_linkage;
+ /**
+ * List for lu device for this site, protected
+ * by ls_ld_lock.
+ **/
+ struct list_head ls_ld_linkage;
+ spinlock_t ls_ld_lock;
+
+ /**
+ * lu_site stats
+ */
+ struct lprocfs_stats *ls_stats;
+ /**
+ * XXX: a hack! fld has to find md_site via site, remove when possible
+ */
+ struct seq_server_site *ld_seq_site;
+};
+
+static inline struct lu_site_bkt_data *
+lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
+{
+ struct cfs_hash_bd bd;
+
+ cfs_hash_bd_get(site->ls_obj_hash, fid, &bd);
+ return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+}
+
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+ return s->ld_seq_site;
+}
+
+/** \name ctors
+ * Constructors/destructors.
+ * @{
+ */
+
+int lu_site_init (struct lu_site *s, struct lu_device *d);
+void lu_site_fini (struct lu_site *s);
+int lu_site_init_finish (struct lu_site *s);
+void lu_stack_fini (const struct lu_env *env, struct lu_device *top);
+void lu_device_get (struct lu_device *d);
+void lu_device_put (struct lu_device *d);
+int lu_device_init (struct lu_device *d, struct lu_device_type *t);
+void lu_device_fini (struct lu_device *d);
+int lu_object_header_init(struct lu_object_header *h);
+void lu_object_header_fini(struct lu_object_header *h);
+int lu_object_init (struct lu_object *o,
+ struct lu_object_header *h, struct lu_device *d);
+void lu_object_fini (struct lu_object *o);
+void lu_object_add_top (struct lu_object_header *h, struct lu_object *o);
+void lu_object_add (struct lu_object *before, struct lu_object *o);
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
+
+/**
+ * Helpers to initialize and finalize device types.
+ */
+
+int lu_device_type_init(struct lu_device_type *ldt);
+void lu_device_type_fini(struct lu_device_type *ldt);
+void lu_types_stop(void);
+
+/** @} ctors */
+
+/** \name caching
+ * Caching and reference counting.
+ * @{
+ */
+
+/**
+ * Acquire an additional reference to the given object. To acquire the
+ * initial reference use lu_object_find().
+ */
+static inline void lu_object_get(struct lu_object *o)
+{
+ LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+ atomic_inc(&o->lo_header->loh_ref);
+}
+
+/**
+ * Return true if the object will not be cached after the last reference
+ * to it is released.
+ */
+static inline int lu_object_is_dying(const struct lu_object_header *h)
+{
+ return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
+}
+
+void lu_object_put(const struct lu_env *env, struct lu_object *o);
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+ lu_printer_t printer);
+struct lu_object *lu_object_find(const struct lu_env *env,
+ struct lu_device *dev, const struct lu_fid *f,
+ const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf);
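+
+/*
+ * Illustrative sketch (not part of the original header): a typical find/put
+ * cycle.  Error handling is abbreviated; "dev" and "fid" are assumed to come
+ * from the caller.
+ *
+ *	struct lu_object *o;
+ *
+ *	o = lu_object_find(env, dev, fid, NULL);
+ *	if (IS_ERR(o))
+ *		return PTR_ERR(o);
+ *	... use the object, taking extra references with lu_object_get() ...
+ *	lu_object_put(env, o);
+ */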
+/** @} caching */
+
+/** \name helpers
+ * Helpers.
+ * @{
+ */
+
+/**
+ * First (topmost) sub-object of given compound object
+ */
+static inline struct lu_object *lu_object_top(struct lu_object_header *h)
+{
+ LASSERT(!list_empty(&h->loh_layers));
+ return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Next sub-object in the layering
+ */
+static inline struct lu_object *lu_object_next(const struct lu_object *o)
+{
+ return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Pointer to the fid of this object.
+ */
+static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
+{
+ return &o->lo_header->loh_fid;
+}
+
+/**
+ * return device operations vector for this object
+ */
+static inline const struct lu_device_operations *
+lu_object_ops(const struct lu_object *o)
+{
+ return o->lo_dev->ld_ops;
+}
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+ const struct lu_device_type *dtype);
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+ void *cookie, const char *format, ...);
+
+/**
+ * Print object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
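+
+/*
+ * Illustrative sketch (not part of the original header), with D_INFO as an
+ * arbitrary debug mask:
+ *
+ *	LU_OBJECT_DEBUG(D_INFO, env, o, "found in cache\n");
+ */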
+
+/**
+ * Print short object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...) \
+do { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
+ \
+ if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
+ lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+ (object)->lo_header); \
+ lu_cdebug_printer(env, &msgdata, "\n"); \
+ CDEBUG(mask, format , ## __VA_ARGS__); \
+ } \
+} while (0)
+
+void lu_object_print (const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether the object exists, on either local or remote storage.
+ * Note: LOHA_EXISTS will be set once someone has created the object;
+ * it does not need to have been committed to storage.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether the object is on remote storage.
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+ return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+ return !lu_object_exists(o);
+}
+
+/**
+ * Attr of this object.
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+ LASSERT(lu_object_exists(o) != 0);
+ return o->lo_header->loh_attr;
+}
+
+static inline void lu_object_ref_add(struct lu_object *o,
+ const char *scope,
+ const void *source)
+{
+ lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_add_at(struct lu_object *o,
+ struct lu_ref_link *link,
+ const char *scope,
+ const void *source)
+{
+ lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+ const char *scope, const void *source)
+{
+ lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+ struct lu_ref_link *link,
+ const char *scope, const void *source)
+{
+ lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input params, should be filled out by mdt */
+struct lu_rdpg {
+ /** hash */
+ __u64 rp_hash;
+ /** count in bytes */
+ unsigned int rp_count;
+ /** number of pages */
+ unsigned int rp_npages;
+ /** requested attr */
+ __u32 rp_attrs;
+ /** pointers to pages */
+ struct page **rp_pages;
+};
+
+enum lu_xattr_flags {
+ LU_XATTR_REPLACE = (1 << 0),
+ LU_XATTR_CREATE = (1 << 1)
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+ LCS_INITIALIZED = 1,
+ LCS_ENTERED,
+ LCS_LEFT,
+ LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently associated
+ * with thread.
+ *
+ * All lu_object methods, except device and device type methods (called during
+ * system initialization and shutdown) are executed "within" some
+ * lu_context. This means that a pointer to some "current" lu_context is passed
+ * as an argument to all methods.
+ *
+ * All service ptlrpc threads create lu_context as part of their
+ * initialization. It is possible to create "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through lu_context_key interface
+ * that allows each layer to associate arbitrary pieces of data with each
+ * context (see pthread_key_create(3) for similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+ /**
+ * lu_context is used on the client side too. Yet we don't want to
+ * allocate values of server-side keys for the client contexts and
+ * vice versa.
+ *
+ * To achieve this, a set of tags is introduced. Contexts and keys are
+ * marked with tags. Key values are created only for contexts whose set
+ * of tags has a non-empty intersection with that of the key. Tags are taken
+ * from enum lu_context_tag.
+ */
+ __u32 lc_tags;
+ enum lu_context_state lc_state;
+ /**
+ * Pointer to the home service thread. NULL for other execution
+ * contexts.
+ */
+ struct ptlrpc_thread *lc_thread;
+ /**
+ * Pointer to an array with key values. Internal implementation
+ * detail.
+ */
+ void **lc_value;
+ /**
+ * Linkage into a list of all remembered contexts. Only
+ * `non-transient' contexts, i.e., ones created for service threads
+ * are placed here.
+ */
+ struct list_head lc_remember;
+ /**
+ * Version counter used to skip calls to lu_context_refill() when no
+ * keys were registered.
+ */
+ unsigned lc_version;
+ /**
+ * Debugging cookie.
+ */
+ unsigned lc_cookie;
+};
+
+/**
+ * lu_context_key interface. Similar to pthread_key.
+ */
+
+enum lu_context_tag {
+ /**
+ * Thread on md server
+ */
+ LCT_MD_THREAD = 1 << 0,
+ /**
+ * Thread on dt server
+ */
+ LCT_DT_THREAD = 1 << 1,
+ /**
+ * Context for transaction handle
+ */
+ LCT_TX_HANDLE = 1 << 2,
+ /**
+ * Thread on client
+ */
+ LCT_CL_THREAD = 1 << 3,
+ /**
+ * A per-request session on a server, and a per-system-call session on
+ * a client.
+ */
+ LCT_SESSION = 1 << 4,
+ /**
+ * A per-request data on OSP device
+ */
+ LCT_OSP_THREAD = 1 << 5,
+ /**
+ * MGS device thread
+ */
+ LCT_MG_THREAD = 1 << 6,
+ /**
+ * Context for local operations
+ */
+ LCT_LOCAL = 1 << 7,
+ /**
+ * Set when at least one of keys, having values in this context has
+ * non-NULL lu_context_key::lct_exit() method. This is used to
+ * optimize lu_context_exit() call.
+ */
+ LCT_HAS_EXIT = 1 << 28,
+ /**
+ * Don't add references for modules creating key values in that context.
+ * This is only for contexts used internally by lu_object framework.
+ */
+ LCT_NOREF = 1 << 29,
+ /**
+ * Key is being prepared for retiring, don't create new values for it.
+ */
+ LCT_QUIESCENT = 1 << 30,
+ /**
+ * Context should be remembered.
+ */
+ LCT_REMEMBER = 1 << 31,
+ /**
+ * Contexts usable in cache shrinker thread.
+ */
+ LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+};
+
+/**
+ * Key. Represents per-context value slot.
+ *
+ * Keys are usually registered when module owning the key is initialized, and
+ * de-registered when module is unloaded. Once key is registered, all new
+ * contexts with matching tags, will get key value. "Old" contexts, already
+ * initialized at the time of key registration, can be forced to get key value
+ * by calling lu_context_refill().
+ *
+ * Every key value is counted in lu_context_key::lct_used and acquires a
+ * reference on an owning module. This means, that all key values have to be
+ * destroyed before module can be unloaded. This is usually achieved by
+ * stopping threads started by the module, that created contexts in their
+ * entry functions. Situation is complicated by the threads shared by multiple
+ * modules, like ptlrpcd daemon on a client. To work around this problem,
+ * contexts, created in such threads, are `remembered' (see
+ * LCT_REMEMBER)---i.e., added into a global list. When module is preparing
+ * for unloading it does the following:
+ *
+ * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT)
+ * preventing new key values from being allocated in the new contexts,
+ * and
+ *
+ * - scans a list of remembered contexts, destroying values of module
+ * keys, thus releasing references to the module.
+ *
+ * This is done by lu_context_key_quiesce(). If module is re-activated
+ * before key has been de-registered, lu_context_key_revive() call clears
+ * `quiescent' marker.
+ *
+ * lu_context code doesn't provide any internal synchronization for these
+ * activities---it's assumed that startup (including threads start-up) and
+ * shutdown are serialized by some external means.
+ *
+ * \see lu_context
+ */
+struct lu_context_key {
+ /**
+ * Set of tags for which values of this key are to be instantiated.
+ */
+ __u32 lct_tags;
+ /**
+ * Value constructor. This is called when new value is created for a
+ * context. Returns a pointer to the new value or an error pointer.
+ */
+ void *(*lct_init)(const struct lu_context *ctx,
+ struct lu_context_key *key);
+ /**
+ * Value destructor. Called when context with previously allocated
+ * value of this slot is destroyed. \a data is a value that was returned
+ * by a matching call to lu_context_key::lct_init().
+ */
+ void (*lct_fini)(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+ /**
+ * Optional method called on lu_context_exit() for all allocated
+ * keys. Can be used by debugging code checking that locks are
+ * released, etc.
+ */
+ void (*lct_exit)(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data);
+ /**
+ * Internal implementation detail: index within lu_context::lc_value[]
+ * reserved for this key.
+ */
+ int lct_index;
+ /**
+ * Internal implementation detail: number of values created for this
+ * key.
+ */
+ atomic_t lct_used;
+ /**
+ * Internal implementation detail: module for this key.
+ */
+ struct module *lct_owner;
+ /**
+ * References to this key. For debugging.
+ */
+ struct lu_ref lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type) \
+ static void *mod##_key_init(const struct lu_context *ctx, \
+ struct lu_context_key *key) \
+ { \
+ type *value; \
+ \
+ CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value)); \
+ \
+ OBD_ALLOC_PTR(value); \
+ if (value == NULL) \
+ value = ERR_PTR(-ENOMEM); \
+ \
+ return value; \
+ } \
+ struct __##mod##__dummy_init {;} /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type) \
+ static void mod##_key_fini(const struct lu_context *ctx, \
+ struct lu_context_key *key, void *data) \
+ { \
+ type *info = data; \
+ \
+ OBD_FREE_PTR(info); \
+ } \
+ struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type) \
+ LU_KEY_INIT(mod, type); \
+ LU_KEY_FINI(mod, type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags) \
+ struct lu_context_key mod##_thread_key = { \
+ .lct_tags = tags, \
+ .lct_init = mod##_key_init, \
+ .lct_fini = mod##_key_fini \
+ }
+
+#define LU_CONTEXT_KEY_INIT(key) \
+do { \
+ (key)->lct_owner = THIS_MODULE; \
+} while (0)
+
+int lu_context_key_register(struct lu_context_key *key);
+void lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get (const struct lu_context *ctx,
+ const struct lu_context_key *key);
+void lu_context_key_quiesce (struct lu_context_key *key);
+void lu_context_key_revive (struct lu_context_key *key);
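+
+/*
+ * Illustrative sketch (not part of the original header): a layer typically
+ * declares its per-context data with the LU_KEY_*() / LU_CONTEXT_KEY_*()
+ * macros above and registers the key at module init time.  "example" and
+ * struct example_thread_info are placeholders.
+ *
+ *	LU_KEY_INIT_FINI(example, struct example_thread_info);
+ *	LU_CONTEXT_KEY_DEFINE(example, LCT_MD_THREAD | LCT_DT_THREAD);
+ *
+ *	LU_CONTEXT_KEY_INIT(&example_thread_key);
+ *	rc = lu_context_key_register(&example_thread_key);
+ *	...
+ *	info = lu_context_key_get(&env->le_ctx, &example_thread_key);
+ *	...
+ *	lu_context_key_degister(&example_thread_key);
+ */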
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
+ * owning module.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod) \
+ static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+ { \
+ struct lu_context_key *key = k; \
+ va_list args; \
+ \
+ va_start(args, k); \
+ do { \
+ LU_CONTEXT_KEY_INIT(key); \
+ key = va_arg(args, struct lu_context_key *); \
+ } while (key != NULL); \
+ va_end(args); \
+ }
+
+#define LU_TYPE_INIT(mod, ...) \
+ LU_KEY_INIT_GENERIC(mod) \
+ static int mod##_type_init(struct lu_device_type *t) \
+ { \
+ mod##_key_init_generic(__VA_ARGS__, NULL); \
+ return lu_context_key_register_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...) \
+ static void mod##_type_fini(struct lu_device_type *t) \
+ { \
+ lu_context_key_degister_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...) \
+ static void mod##_type_start(struct lu_device_type *t) \
+ { \
+ lu_context_key_revive_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_start {;}
+
+#define LU_TYPE_STOP(mod, ...) \
+ static void mod##_type_stop(struct lu_device_type *t) \
+ { \
+ lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
+ } \
+ struct __##mod##_dummy_type_stop {;}
+
+
+
+#define LU_TYPE_INIT_FINI(mod, ...) \
+ LU_TYPE_INIT(mod, __VA_ARGS__); \
+ LU_TYPE_FINI(mod, __VA_ARGS__); \
+ LU_TYPE_START(mod, __VA_ARGS__); \
+ LU_TYPE_STOP(mod, __VA_ARGS__)
+
+int lu_context_init (struct lu_context *ctx, __u32 tags);
+void lu_context_fini (struct lu_context *ctx);
+void lu_context_enter (struct lu_context *ctx);
+void lu_context_exit (struct lu_context *ctx);
+int lu_context_refill(struct lu_context *ctx);
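+
+/*
+ * A hedged sketch of the usual lu_context life cycle; foo_thread_key and the
+ * LCT_MD_THREAD tag are illustrative names only:
+ *
+ * \code
+ * struct lu_context ctx;
+ * struct foo_thread_info *info;
+ *
+ * if (lu_context_init(&ctx, LCT_MD_THREAD) == 0) {
+ *      lu_context_enter(&ctx);
+ *      info = lu_context_key_get(&ctx, &foo_thread_key);
+ *      // ... use the per-thread data behind info ...
+ *      lu_context_exit(&ctx);
+ *      lu_context_fini(&ctx);
+ * }
+ * \endcode
+ */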
+
+/*
+ * Helper functions to operate on multiple keys. These are used by the default
+ * device type operations, defined by LU_TYPE_INIT_FINI().
+ */
+
+int lu_context_key_register_many(struct lu_context_key *k, ...);
+void lu_context_key_degister_many(struct lu_context_key *k, ...);
+void lu_context_key_revive_many (struct lu_context_key *k, ...);
+void lu_context_key_quiesce_many (struct lu_context_key *k, ...);
+
+/*
+ * update/clear ctx/ses tags.
+ */
+void lu_context_tags_update(__u32 tags);
+void lu_context_tags_clear(__u32 tags);
+void lu_session_tags_update(__u32 tags);
+void lu_session_tags_clear(__u32 tags);
+
+/**
+ * Environment.
+ */
+struct lu_env {
+ /**
+ * "Local" context, used to store data instead of stack.
+ */
+ struct lu_context le_ctx;
+ /**
+ * "Session" context for per-request data.
+ */
+ struct lu_context *le_ses;
+};
+
+int lu_env_init (struct lu_env *env, __u32 tags);
+void lu_env_fini (struct lu_env *env);
+int lu_env_refill(struct lu_env *env);
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
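+
+/*
+ * Typical environment setup, sketched under the assumption that the caller
+ * runs in an LCT_MD_THREAD-tagged context (the tag is only an example):
+ *
+ * \code
+ * struct lu_env env;
+ * int rc;
+ *
+ * rc = lu_env_init(&env, LCT_MD_THREAD);
+ * if (rc == 0) {
+ *      // pass &env down to the lu_/dt_/cl_ methods that need it
+ *      lu_env_fini(&env);
+ * }
+ * \endcode
+ */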
+
+/** @} lu_context */
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m);
+
+/**
+ * Common name structure to be passed around for various name related methods.
+ */
+struct lu_name {
+ const char *ln_name;
+ int ln_namelen;
+};
+
+/**
+ * Common buffer structure to be passed around for various xattr_{s,g}et()
+ * methods.
+ */
+struct lu_buf {
+ void *lb_buf;
+ ssize_t lb_len;
+};
+
+#define DLUBUF "(%p %zu)"
+#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
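+
+/*
+ * DLUBUF/PLUBUF pair a printf format with its arguments for a lu_buf;
+ * a small illustrative use (D_OTHER is just one possible debug mask):
+ *
+ * \code
+ * CDEBUG(D_OTHER, "xattr buffer "DLUBUF"\n", PLUBUF(buf));
+ * \endcode
+ */
+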
+/**
+ * One-time initializers, called at obdclass module initialization, not
+ * exported.
+ */
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void);
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void);
+
+struct lu_kmem_descr {
+ struct kmem_cache **ckd_cache;
+ const char *ckd_name;
+ const size_t ckd_size;
+};
+
+int lu_kmem_init(struct lu_kmem_descr *caches);
+void lu_kmem_fini(struct lu_kmem_descr *caches);
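+
+/*
+ * lu_kmem_init()/lu_kmem_fini() walk an array of descriptors; by convention
+ * the array is terminated by an entry whose ckd_cache is NULL. A sketch,
+ * with the cache pointer, name and struct foo_object purely illustrative:
+ *
+ * \code
+ * static struct kmem_cache *foo_object_kmem;
+ *
+ * static struct lu_kmem_descr foo_caches[] = {
+ *      {
+ *              .ckd_cache = &foo_object_kmem,
+ *              .ckd_name  = "foo_object_kmem",
+ *              .ckd_size  = sizeof(struct foo_object)
+ *      },
+ *      {
+ *              .ckd_cache = NULL
+ *      }
+ * };
+ *
+ * // lu_kmem_init(foo_caches) at module load,
+ * // lu_kmem_fini(foo_caches) at unload.
+ * \endcode
+ */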
+
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+ const struct lu_fid *fid);
+struct lu_object *lu_object_anon(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_object_conf *conf);
+
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, int size);
+void lu_buf_realloc(struct lu_buf *buf, int size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, int len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len);
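+
+/*
+ * A short illustrative sequence for the lu_buf helpers above (sizes are
+ * arbitrary and error handling is abbreviated):
+ *
+ * \code
+ * struct lu_buf buf = { NULL, 0 };
+ *
+ * lu_buf_alloc(&buf, 4096);
+ * if (buf.lb_buf != NULL) {
+ *      // ... fill at most buf.lb_len bytes at buf.lb_buf ...
+ *      if (lu_buf_check_and_grow(&buf, 8192) == 0) {
+ *              // the buffer now holds at least 8192 bytes
+ *      }
+ *      lu_buf_free(&buf);
+ * }
+ * \endcode
+ */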
+
+/** @} lu */
+#endif /* __LUSTRE_LU_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h
new file mode 100644
index 000000000..b451a888c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lu_ref.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LUSTRE_LU_REF_H
+#define __LUSTRE_LU_REF_H
+
+#include <linux/list.h>
+
+/** \defgroup lu_ref lu_ref
+ *
+ * An interface to track references between objects. Mostly for debugging.
+ *
+ * Suppose there is a reference counted data-structure struct foo. To track
+ * who acquired references to instance of struct foo, add lu_ref field to it:
+ *
+ * \code
+ * struct foo {
+ * atomic_t foo_refcount;
+ * struct lu_ref foo_reference;
+ * ...
+ * };
+ * \endcode
+ *
+ * foo::foo_reference has to be initialized by calling
+ * lu_ref_init(). Typically there will be functions or macros to increment and
+ * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo)
+ * and foo_put(struct foo *foo), respectively.
+ *
+ * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add()
+ * has to be called to insert into foo::foo_reference a record describing the
+ * acquired reference. Dually, lu_ref_del() removes the matching record.
+ * Typical usages are:
+ *
+ * \code
+ * struct bar *bar;
+ *
+ * // bar owns a reference to foo.
+ * bar->bar_foo = foo_get(foo);
+ * lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ * ...
+ *
+ * // reference from bar to foo is released.
+ * lu_ref_del(&foo->foo_reference, "bar", bar);
+ * foo_put(bar->bar_foo);
+ *
+ *
+ * // current thread acquired a temporary reference to foo.
+ * foo_get(foo);
+ * lu_ref_add(&foo->reference, __func__, current);
+ *
+ * ...
+ *
+ * // temporary reference is released.
+ * lu_ref_del(&foo->reference, __func__, current);
+ * foo_put(foo);
+ * \endcode
+ *
+ * \e Et \e cetera. Often it makes sense to include the lu_ref_add() and
+ * lu_ref_del() calls in foo_get() and foo_put() themselves. When an instance
+ * of struct foo is destroyed, lu_ref_fini() has to be called; it checks that
+ * no pending references remain. lu_ref_print() can be used to dump the list
+ * of pending references while hunting down a leak.
+ *
+ * For objects to which a large number of references can be acquired,
+ * lu_ref_del() can become CPU-consuming, as it has to scan the list of
+ * references. To work around this, remember the result of lu_ref_add()
+ * (usually in the same place where the pointer to struct foo is stored) and
+ * use lu_ref_del_at():
+ *
+ * \code
+ * // There is a large number of bar's for a single foo.
+ * bar->bar_foo = foo_get(foo);
+ * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ * ...
+ *
+ * // reference from bar to foo is released.
+ * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar);
+ * foo_put(bar->bar_foo);
+ * \endcode
+ *
+ * lu_ref interface degrades gracefully in case of memory shortages.
+ *
+ * @{
+ */
+
+
+/*
+ * Dummy data structures/functions to keep this compiling for now.
+ * They need to be reimplemented with kref.
+ */
+struct lu_ref {};
+struct lu_ref_link {};
+
+static inline void lu_ref_init(struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_fini(struct lu_ref *ref)
+{
+}
+
+static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref,
+ const char *scope,
+ const void *source)
+{
+ return NULL;
+}
+
+static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref,
+ const char *scope,
+ const void *source)
+{
+ return NULL;
+}
+
+static inline void lu_ref_add_at(struct lu_ref *ref,
+ struct lu_ref_link *link,
+ const char *scope,
+ const void *source)
+{
+}
+
+static inline void lu_ref_del(struct lu_ref *ref, const char *scope,
+ const void *source)
+{
+}
+
+static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
+ const char *scope, const void *source0,
+ const void *source1)
+{
+}
+
+static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
+ const char *scope, const void *source)
+{
+}
+
+static inline int lu_ref_global_init(void)
+{
+ return 0;
+}
+
+static inline void lu_ref_global_fini(void)
+{
+}
+
+static inline void lu_ref_print(const struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_print_all(void)
+{
+}
+
+/** @} lu */
+
+#endif /* __LUSTRE_LU_REF_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h
new file mode 100644
index 000000000..e8e0b084a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/libiam.h
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+/*
+ * lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+ FMT_LFIX,
+ FMT_LVAR
+};
+
+struct iam_uapi_info {
+ __u16 iui_keysize;
+ __u16 iui_recsize;
+ __u16 iui_ptrsize;
+ __u16 iui_height;
+ char iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Returns 0 on success, -1 otherwise.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+ int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Use iam_creat() to create the file before calling iam_open().
+ * Returns the file descriptor (fd) on success, -1 otherwise.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close a file opened by iam_open().
+ */
+int iam_close(int fd);
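+
+/*
+ * The typical call sequence for this user-level API, as a sketch; the file
+ * name and geometry values below are arbitrary examples:
+ *
+ * \code
+ * struct iam_uapi_info ua;
+ * int fd;
+ *
+ * if (iam_creat("foo.iam", FMT_LFIX, 4096, 8, 16, 4) == 0) {
+ *      fd = iam_open("foo.iam", &ua);
+ *      if (fd >= 0) {
+ *              // iam_insert()/iam_lookup()/iam_delete() calls go here
+ *              iam_close(fd);
+ *      }
+ * }
+ * \endcode
+ */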
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *keybuf,
+ int rec_need_convert, char *recbuf);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *key_buf,
+ int *keysize, char *save_key,
+ int rec_need_convert, char *rec_buf,
+ int *recsize, char *save_rec);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *keybuf,
+ int rec_need_convert, char *recbuf);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *key_buf,
+ int *keysize, char *save_key,
+ int rec_need_convert, char *rec_buf,
+ int *recsize, char *save_rec);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *key_buf,
+ int *keysize, char *save_key,
+ int rec_need_convert, char *rec_buf,
+ int *recsize, char *save_rec);
+
+/*
+ * Call iam_open() before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+ int key_need_convert, char *keybuf,
+ int rec_need_convert, char *recbuf);
+
+/*
+ * Change iam file mode.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
+
+/** @} libiam */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
new file mode 100644
index 000000000..ad253c6de
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+
+
+struct ll_fiemap_extent {
+ __u64 fe_logical; /* logical offset in bytes for the start of
+ * the extent from the beginning of the file */
+ __u64 fe_physical; /* physical offset in bytes for the start
+ * of the extent from the beginning of the disk */
+ __u64 fe_length; /* length in bytes for this extent */
+ __u64 fe_reserved64[2];
+ __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
+ __u32 fe_device; /* device number for this extent */
+ __u32 fe_reserved[2];
+};
+
+struct ll_user_fiemap {
+ __u64 fm_start; /* logical offset (inclusive) at
+ * which to start mapping (in) */
+ __u64 fm_length; /* logical length of mapping which
+ * userspace wants (in) */
+ __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
+ __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+ __u32 fm_extent_count; /* size of fm_extents array (in) */
+ __u32 fm_reserved;
+ struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET (~0ULL)
+
+#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
+ * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read
+ * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs.
+ * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be
+ * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block.
+ * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but
+ * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively
+ * support extents. Result
+ * merged for efficiency. */
+
+
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+ return (sizeof(struct ll_user_fiemap) + extent_count *
+ sizeof(struct ll_fiemap_extent));
+}
+
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+ return ((array_size - sizeof(struct ll_user_fiemap)) /
+ sizeof(struct ll_fiemap_extent));
+}
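+
+/*
+ * The two helpers above convert between an extent count and the byte size
+ * of the corresponding ll_user_fiemap buffer; for example, with a request
+ * for 32 extents (the count is purely illustrative):
+ *
+ * \code
+ * size_t bytes = fiemap_count_to_size(32);
+ * // bytes == sizeof(struct ll_user_fiemap) +
+ * //          32 * sizeof(struct ll_fiemap_extent)
+ * // and fiemap_size_to_count(bytes) == 32
+ * \endcode
+ */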
+
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#ifdef FIEMAP_FLAGS_COMPAT
+#undef FIEMAP_FLAGS_COMPAT
+#endif
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely.
+ * Sets NO_DIRECT flag */
+
+#endif /* _LUSTRE_FIEMAP_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
new file mode 100644
index 000000000..93a3d7db3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
@@ -0,0 +1,2 @@
+#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0"
+#define LUSTRE_RELEASE 3.9.0_g6e62c21
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h b/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h
new file mode 100644
index 000000000..35aefa2cd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_errno.h
@@ -0,0 +1,215 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+
+#ifndef LUSTRE_ERRNO_H
+#define LUSTRE_ERRNO_H
+
+/*
+ * Only "network" errnos, which are defined below, are allowed on wire (or on
+ * disk). Generic routines exist to help translate between these and a subset
+ * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally
+ * left out. See also the comment on lustre_errno_hton_mapping[].
+ *
+ * To maintain compatibility with existing x86 clients and servers, each of
+ * these network errnos has the same numerical value as its corresponding host
+ * errno on x86.
+ */
+#define LUSTRE_EPERM 1 /* Operation not permitted */
+#define LUSTRE_ENOENT 2 /* No such file or directory */
+#define LUSTRE_ESRCH 3 /* No such process */
+#define LUSTRE_EINTR 4 /* Interrupted system call */
+#define LUSTRE_EIO 5 /* I/O error */
+#define LUSTRE_ENXIO 6 /* No such device or address */
+#define LUSTRE_E2BIG 7 /* Argument list too long */
+#define LUSTRE_ENOEXEC 8 /* Exec format error */
+#define LUSTRE_EBADF 9 /* Bad file number */
+#define LUSTRE_ECHILD 10 /* No child processes */
+#define LUSTRE_EAGAIN 11 /* Try again */
+#define LUSTRE_ENOMEM 12 /* Out of memory */
+#define LUSTRE_EACCES 13 /* Permission denied */
+#define LUSTRE_EFAULT 14 /* Bad address */
+#define LUSTRE_ENOTBLK 15 /* Block device required */
+#define LUSTRE_EBUSY 16 /* Device or resource busy */
+#define LUSTRE_EEXIST 17 /* File exists */
+#define LUSTRE_EXDEV 18 /* Cross-device link */
+#define LUSTRE_ENODEV 19 /* No such device */
+#define LUSTRE_ENOTDIR 20 /* Not a directory */
+#define LUSTRE_EISDIR 21 /* Is a directory */
+#define LUSTRE_EINVAL 22 /* Invalid argument */
+#define LUSTRE_ENFILE 23 /* File table overflow */
+#define LUSTRE_EMFILE 24 /* Too many open files */
+#define LUSTRE_ENOTTY 25 /* Not a typewriter */
+#define LUSTRE_ETXTBSY 26 /* Text file busy */
+#define LUSTRE_EFBIG 27 /* File too large */
+#define LUSTRE_ENOSPC 28 /* No space left on device */
+#define LUSTRE_ESPIPE 29 /* Illegal seek */
+#define LUSTRE_EROFS 30 /* Read-only file system */
+#define LUSTRE_EMLINK 31 /* Too many links */
+#define LUSTRE_EPIPE 32 /* Broken pipe */
+#define LUSTRE_EDOM 33 /* Math argument out of domain of
+ func */
+#define LUSTRE_ERANGE 34 /* Math result not representable */
+#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */
+#define LUSTRE_ENAMETOOLONG 36 /* File name too long */
+#define LUSTRE_ENOLCK 37 /* No record locks available */
+#define LUSTRE_ENOSYS 38 /* Function not implemented */
+#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */
+#define LUSTRE_ELOOP 40 /* Too many symbolic links
+ encountered */
+#define LUSTRE_ENOMSG 42 /* No message of desired type */
+#define LUSTRE_EIDRM 43 /* Identifier removed */
+#define LUSTRE_ECHRNG 44 /* Channel number out of range */
+#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */
+#define LUSTRE_EL3HLT 46 /* Level 3 halted */
+#define LUSTRE_EL3RST 47 /* Level 3 reset */
+#define LUSTRE_ELNRNG 48 /* Link number out of range */
+#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */
+#define LUSTRE_ENOCSI 50 /* No CSI structure available */
+#define LUSTRE_EL2HLT 51 /* Level 2 halted */
+#define LUSTRE_EBADE 52 /* Invalid exchange */
+#define LUSTRE_EBADR 53 /* Invalid request descriptor */
+#define LUSTRE_EXFULL 54 /* Exchange full */
+#define LUSTRE_ENOANO 55 /* No anode */
+#define LUSTRE_EBADRQC 56 /* Invalid request code */
+#define LUSTRE_EBADSLT 57 /* Invalid slot */
+#define LUSTRE_EBFONT 59 /* Bad font file format */
+#define LUSTRE_ENOSTR 60 /* Device not a stream */
+#define LUSTRE_ENODATA 61 /* No data available */
+#define LUSTRE_ETIME 62 /* Timer expired */
+#define LUSTRE_ENOSR 63 /* Out of streams resources */
+#define LUSTRE_ENONET 64 /* Machine is not on the network */
+#define LUSTRE_ENOPKG 65 /* Package not installed */
+#define LUSTRE_EREMOTE 66 /* Object is remote */
+#define LUSTRE_ENOLINK 67 /* Link has been severed */
+#define LUSTRE_EADV 68 /* Advertise error */
+#define LUSTRE_ESRMNT 69 /* Srmount error */
+#define LUSTRE_ECOMM 70 /* Communication error on send */
+#define LUSTRE_EPROTO 71 /* Protocol error */
+#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */
+#define LUSTRE_EDOTDOT 73 /* RFS specific error */
+#define LUSTRE_EBADMSG 74 /* Not a data message */
+#define LUSTRE_EOVERFLOW 75 /* Value too large for defined data
+ type */
+#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */
+#define LUSTRE_EBADFD 77 /* File descriptor in bad state */
+#define LUSTRE_EREMCHG 78 /* Remote address changed */
+#define LUSTRE_ELIBACC 79 /* Can not access a needed shared
+ library */
+#define LUSTRE_ELIBBAD 80 /* Accessing a corrupted shared
+ library */
+#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */
+#define LUSTRE_ELIBMAX 82 /* Attempting to link in too many shared
+ libraries */
+#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared library
+ directly */
+#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */
+#define LUSTRE_ERESTART 85 /* Interrupted system call should be
+ restarted */
+#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */
+#define LUSTRE_EUSERS 87 /* Too many users */
+#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */
+#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */
+#define LUSTRE_EMSGSIZE 90 /* Message too long */
+#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */
+#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */
+#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */
+#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */
+#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported on transport
+ endpoint */
+#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */
+#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported by
+ protocol */
+#define LUSTRE_EADDRINUSE 98 /* Address already in use */
+#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */
+#define LUSTRE_ENETDOWN 100 /* Network is down */
+#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */
+#define LUSTRE_ENETRESET 102 /* Network dropped connection because of
+ reset */
+#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */
+#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */
+#define LUSTRE_ENOBUFS 105 /* No buffer space available */
+#define LUSTRE_EISCONN 106 /* Transport endpoint is already
+ connected */
+#define LUSTRE_ENOTCONN 107 /* Transport endpoint is not
+ connected */
+#define LUSTRE_ESHUTDOWN 108 /* Cannot send after transport endpoint
+ shutdown */
+#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */
+#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */
+#define LUSTRE_ECONNREFUSED 111 /* Connection refused */
+#define LUSTRE_EHOSTDOWN 112 /* Host is down */
+#define LUSTRE_EHOSTUNREACH 113 /* No route to host */
+#define LUSTRE_EALREADY 114 /* Operation already in progress */
+#define LUSTRE_EINPROGRESS 115 /* Operation now in progress */
+#define LUSTRE_ESTALE 116 /* Stale file handle */
+#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */
+#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */
+#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */
+#define LUSTRE_EISNAM 120 /* Is a named type file */
+#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */
+#define LUSTRE_EDQUOT 122 /* Quota exceeded */
+#define LUSTRE_ENOMEDIUM 123 /* No medium found */
+#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */
+#define LUSTRE_ECANCELED 125 /* Operation Canceled */
+#define LUSTRE_ENOKEY 126 /* Required key not available */
+#define LUSTRE_EKEYEXPIRED 127 /* Key has expired */
+#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */
+#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */
+#define LUSTRE_EOWNERDEAD 130 /* Owner died */
+#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */
+#define LUSTRE_ERESTARTSYS 512
+#define LUSTRE_ERESTARTNOINTR 513
+#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */
+#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */
+#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart by calling
+ sys_restart_syscall */
+#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */
+#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */
+#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */
+#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */
+#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */
+#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */
+#define LUSTRE_EBADTYPE 527 /* Type not supported by server */
+#define LUSTRE_EJUKEBOX 528 /* Request initiated, but will not
+ complete before timeout */
+#define LUSTRE_EIOCBQUEUED 529 /* iocb queued, will get completion
+ event */
+#define LUSTRE_EIOCBRETRY 530 /* iocb queued, will trigger a retry */
+
+/*
+ * Translations are optimized away on x86. Host errnos that shouldn't be put
+ * on wire could leak through as a result. Do not count on this side effect.
+ */
+#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS
+unsigned int lustre_errno_hton(unsigned int h);
+unsigned int lustre_errno_ntoh(unsigned int n);
+#else
+#define lustre_errno_hton(h) (h)
+#define lustre_errno_ntoh(n) (n)
+#endif
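+
+/*
+ * A hedged sketch of the intended use of the translation helpers: convert a
+ * (positive) host errno to the network encoding before it goes on the wire
+ * and back on receipt. When CONFIG_LUSTRE_TRANSLATE_ERRNOS is not set, both
+ * calls are identity macros:
+ *
+ * \code
+ * unsigned int wire_err = lustre_errno_hton(ENOENT);
+ * // ... wire_err travels inside a message ...
+ * unsigned int host_err = lustre_errno_ntoh(wire_err);
+ * // host_err == ENOENT (and LUSTRE_ENOENT == ENOENT on x86)
+ * \endcode
+ */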
+
+#endif /* LUSTRE_ERRNO_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
new file mode 100644
index 000000000..305ecbee9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -0,0 +1,3734 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
+ *
+ * Lustre wire protocol definitions.
+ */
+
+/** \defgroup lustreidl lustreidl
+ *
+ * Lustre wire protocol definitions.
+ *
+ * ALL structs passing over the wire should be declared here. Structs
+ * that are used in interfaces with userspace should go in lustre_user.h.
+ *
+ * All structs being declared here should be built from simple fixed-size
+ * types (__u8, __u16, __u32, __u64) or be built from other types or
+ * structs also declared in this file. Similarly, all flags and magic
+ * values in those structs should also be declared here. This ensures
+ * that the Lustre wire protocol is not influenced by external dependencies.
+ *
+ * The only other acceptable items in this file are VERY SIMPLE accessor
+ * functions to avoid callers grubbing inside the structures, and the
+ * prototypes of the swabber functions for each struct. Nothing that
+ * depends on external functions or definitions should be in here.
+ *
+ * Structs must be properly aligned to put 64-bit values on an 8-byte
+ * boundary. Any structs being added here must also be added to
+ * utils/wirecheck.c and "make newwiretest" run to regenerate the
+ * utils/wiretest.c sources. This allows us to verify that wire structs
+ * have the proper alignment/size on all architectures.
+ *
+ * DO NOT CHANGE any of the structs, flags, values declared here and used
+ * in released Lustre versions. Some structs may have padding fields that
+ * can be used. Some structs might allow addition at the end (verify this
+ * in the code to ensure that new/old clients that see this larger struct
+ * do not fail, otherwise you need to implement protocol compatibility).
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format. The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument. The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_IDL_H_
+#define _LUSTRE_IDL_H_
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+/* Defn's shared with user-space. */
+#include "lustre_user.h"
+#include "lustre_errno.h"
+
+/*
+ * GENERAL STUFF
+ */
+/* FOO_REQUEST_PORTAL is for incoming requests on the FOO
+ * FOO_REPLY_PORTAL is for incoming replies on the FOO
+ * FOO_BULK_PORTAL is for incoming bulk on the FOO
+ */
+
+/* Lustre service names follow the format
+ * service name + MDT + seq name
+ */
+#define LUSTRE_MDT_MAXNAMELEN 80
+
+#define CONNMGR_REQUEST_PORTAL 1
+#define CONNMGR_REPLY_PORTAL 2
+//#define OSC_REQUEST_PORTAL 3
+#define OSC_REPLY_PORTAL 4
+//#define OSC_BULK_PORTAL 5
+#define OST_IO_PORTAL 6
+#define OST_CREATE_PORTAL 7
+#define OST_BULK_PORTAL 8
+//#define MDC_REQUEST_PORTAL 9
+#define MDC_REPLY_PORTAL 10
+//#define MDC_BULK_PORTAL 11
+#define MDS_REQUEST_PORTAL 12
+//#define MDS_REPLY_PORTAL 13
+#define MDS_BULK_PORTAL 14
+#define LDLM_CB_REQUEST_PORTAL 15
+#define LDLM_CB_REPLY_PORTAL 16
+#define LDLM_CANCEL_REQUEST_PORTAL 17
+#define LDLM_CANCEL_REPLY_PORTAL 18
+//#define PTLBD_REQUEST_PORTAL 19
+//#define PTLBD_REPLY_PORTAL 20
+//#define PTLBD_BULK_PORTAL 21
+#define MDS_SETATTR_PORTAL 22
+#define MDS_READPAGE_PORTAL 23
+#define OUT_PORTAL 24
+
+#define MGC_REPLY_PORTAL 25
+#define MGS_REQUEST_PORTAL 26
+#define MGS_REPLY_PORTAL 27
+#define OST_REQUEST_PORTAL 28
+#define FLD_REQUEST_PORTAL 29
+#define SEQ_METADATA_PORTAL 30
+#define SEQ_DATA_PORTAL 31
+#define SEQ_CONTROLLER_PORTAL 32
+#define MGS_BULK_PORTAL 33
+
+/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */
+
+/* packet types */
+#define PTL_RPC_MSG_REQUEST 4711
+#define PTL_RPC_MSG_ERR 4712
+#define PTL_RPC_MSG_REPLY 4713
+
+/* DON'T use swabbed values of MAGIC as magic! */
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B
+#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B
+
+#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2
+
+#define PTLRPC_MSG_VERSION 0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION 0x00010000
+#define LUSTRE_MDS_VERSION 0x00020000
+#define LUSTRE_OST_VERSION 0x00030000
+#define LUSTRE_DLM_VERSION 0x00040000
+#define LUSTRE_LOG_VERSION 0x00050000
+#define LUSTRE_MGS_VERSION 0x00060000
+
+/**
+ * Describes a range of sequence numbers: lsr_start is included in the range,
+ * lsr_end is not.
+ * The same structure is used in the fld module, where the lsr_index field
+ * holds the MDT id of the home MDT.
+ */
+struct lu_seq_range {
+ __u64 lsr_start;
+ __u64 lsr_end;
+ __u32 lsr_index;
+ __u32 lsr_flags;
+};
+
+#define LU_SEQ_RANGE_MDT 0x0
+#define LU_SEQ_RANGE_OST 0x1
+#define LU_SEQ_RANGE_ANY 0x3
+
+#define LU_SEQ_RANGE_MASK 0x3
+
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+ return range->lsr_flags & LU_SEQ_RANGE_MASK;
+}
+
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+ return fld_range_type(range) == LU_SEQ_RANGE_OST;
+}
+
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+ return fld_range_type(range) == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * The ANY range type is only used when the fld client sends an fld query
+ * request but does not know whether the sequence belongs to an MDT or an
+ * OST; the request is sent with the ANY type, which means that either
+ * sequence type may be returned by the lookup.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+ return fld_range_type(range) == LU_SEQ_RANGE_ANY;
+}
+
+static inline void fld_range_set_type(struct lu_seq_range *range,
+ unsigned flags)
+{
+ range->lsr_flags |= flags;
+}
+
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+ fld_range_set_type(range, LU_SEQ_RANGE_MDT);
+}
+
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+ fld_range_set_type(range, LU_SEQ_RANGE_OST);
+}
+
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+ fld_range_set_type(range, LU_SEQ_RANGE_ANY);
+}
+
+/**
+ * returns the width of the given range \a range
+ */
+
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+ return range->lsr_end - range->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ */
+
+static inline void range_init(struct lu_seq_range *range)
+{
+ memset(range, 0, sizeof(*range));
+}
+
+/**
+ * check if the given sequence id \a s is within the given range \a range
+ */
+
+static inline int range_within(const struct lu_seq_range *range,
+ __u64 s)
+{
+ return s >= range->lsr_start && s < range->lsr_end;
+}
+
+static inline int range_is_sane(const struct lu_seq_range *range)
+{
+ return (range->lsr_end >= range->lsr_start);
+}
+
+static inline int range_is_zero(const struct lu_seq_range *range)
+{
+ return (range->lsr_start == 0 && range->lsr_end == 0);
+}
+
+static inline int range_is_exhausted(const struct lu_seq_range *range)
+
+{
+ return range_space(range) == 0;
+}
+
+/* return 0 if the two ranges have the same location */
+static inline int range_compare_loc(const struct lu_seq_range *r1,
+ const struct lu_seq_range *r2)
+{
+ return r1->lsr_index != r2->lsr_index ||
+ r1->lsr_flags != r2->lsr_flags;
+}
+
+#define DRANGE "[%#16.16Lx-%#16.16Lx):%x:%s"
+
+#define PRANGE(range) \
+ (range)->lsr_start, \
+ (range)->lsr_end, \
+ (range)->lsr_index, \
+ fld_range_is_mdt(range) ? "mdt" : "ost"
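+
+/*
+ * DRANGE/PRANGE pair a printf format with its arguments for a sequence
+ * range; a sketch of typical debugging output (D_INFO is just one possible
+ * debug mask, and seq/range are assumed to be in scope):
+ *
+ * \code
+ * if (range_within(range, seq))
+ *      CDEBUG(D_INFO, "seq %#llx is inside "DRANGE"\n",
+ *             seq, PRANGE(range));
+ * \endcode
+ */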
+
+
+/** \defgroup lu_fid lu_fid
+ * @{ */
+
+/**
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
+ * Deprecated since HSM and SOM attributes are now stored in separate on-disk
+ * xattr.
+ */
+enum lma_compat {
+ LMAC_HSM = 0x00000001,
+ LMAC_SOM = 0x00000002,
+ LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */
+ LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is
+ * under /O/<seq>/d<x>. */
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+ LMAI_RELEASED = 0x00000001, /* file is released */
+ LMAI_AGENT = 0x00000002, /* agent inode */
+ LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object
+ is on the remote MDT */
+};
+#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT)
+
+/**
+ * fid constants
+ */
+enum {
+ /** LASTID file has zero OID */
+ LUSTRE_FID_LASTID_OID = 0UL,
+ /** initial fid id value */
+ LUSTRE_FID_INIT_OID = 1UL
+};
+
+/** returns fid object sequence */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+ return fid->f_seq;
+}
+
+/** returns fid object id */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+ return fid->f_oid;
+}
+
+/** returns fid object version */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+ return fid->f_ver;
+}
+
+static inline void fid_zero(struct lu_fid *fid)
+{
+ memset(fid, 0, sizeof(*fid));
+}
+
+static inline __u64 fid_ver_oid(const struct lu_fid *fid)
+{
+ return ((__u64)fid_ver(fid) << 32 | fid_oid(fid));
+}
+
+/**
+ * Note that reserved SEQ numbers below 12 will conflict with ldiskfs
+ * inodes in the IGIF namespace, so these reserved SEQ numbers can be
+ * used for other purposes and not risk collisions with existing inodes.
+ *
+ * Different FID Format
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0
+ */
+enum fid_seq {
+ FID_SEQ_OST_MDT0 = 0,
+ FID_SEQ_LLOG = 1, /* unnamed llogs */
+ FID_SEQ_ECHO = 2,
+ FID_SEQ_OST_MDT1 = 3,
+ FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */
+ FID_SEQ_LLOG_NAME = 10, /* named llogs */
+ FID_SEQ_RSVD = 11,
+ FID_SEQ_IGIF = 12,
+ FID_SEQ_IGIF_MAX = 0x0ffffffffULL,
+ FID_SEQ_IDIF = 0x100000000ULL,
+ FID_SEQ_IDIF_MAX = 0x1ffffffffULL,
+ /* Normal FID sequence starts from this value, i.e. 1<<33 */
+ FID_SEQ_START = 0x200000000ULL,
+ /* sequence for local pre-defined FIDs listed in local_oid */
+ FID_SEQ_LOCAL_FILE = 0x200000001ULL,
+ FID_SEQ_DOT_LUSTRE = 0x200000002ULL,
+ /* sequence is used for local named objects FIDs generated
+ * by local_object_storage library */
+ FID_SEQ_LOCAL_NAME = 0x200000003ULL,
+ /* Because the current FLD only caches the fid sequence (not the oid)
+ * on the client side, if a FID needs to be exposed to clients, all
+ * fids under one sequence must be located on one MDT. */
+ FID_SEQ_SPECIAL = 0x200000004ULL,
+ FID_SEQ_QUOTA = 0x200000005ULL,
+ FID_SEQ_QUOTA_GLB = 0x200000006ULL,
+ FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */
+ FID_SEQ_NORMAL = 0x200000400ULL,
+ FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL
+};
+
+#define OBIF_OID_MAX_BITS 32
+#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS 48
+#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+ /* Big Filesystem Lock to serialize rename operations */
+ FID_OID_SPECIAL_BFL = 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+ FID_OID_DOT_LUSTRE = 1UL,
+ FID_OID_DOT_LUSTRE_OBF = 2UL,
+};
+
+static inline int fid_seq_is_mdt0(__u64 seq)
+{
+ return (seq == FID_SEQ_OST_MDT0);
+}
+
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+ return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+};
+
+static inline int fid_seq_is_echo(__u64 seq)
+{
+ return (seq == FID_SEQ_ECHO);
+}
+
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+ return fid_seq_is_echo(fid_seq(fid));
+}
+
+static inline int fid_seq_is_llog(__u64 seq)
+{
+ return (seq == FID_SEQ_LLOG);
+}
+
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+ /* file with OID == 0 is not llog but contains last oid */
+ return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0;
+}
+
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+ return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+};
+
+static inline int fid_seq_is_special(const __u64 seq)
+{
+ return seq == FID_SEQ_SPECIAL;
+};
+
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+ return seq == FID_SEQ_LOCAL_FILE ||
+ seq == FID_SEQ_LOCAL_NAME;
+};
+
+static inline int fid_seq_is_root(const __u64 seq)
+{
+ return seq == FID_SEQ_ROOT;
+}
+
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+ return seq == FID_SEQ_DOT_LUSTRE;
+}
+
+static inline int fid_seq_is_default(const __u64 seq)
+{
+ return seq == FID_SEQ_LOV_DEFAULT;
+}
+
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+ return fid_seq_is_mdt0(fid_seq(fid));
+}
+
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+ fid->f_seq = FID_SEQ_ROOT;
+ fid->f_oid = 1;
+ fid->f_ver = 0;
+}
+
+/**
+ * Check if a fid is an igif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an igif; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+ return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX;
+}
+
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+ return fid_seq_is_igif(fid_seq(fid));
+}
+
+/**
+ * Check if a fid is an idif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an idif; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+ return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX;
+}
+
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+ return fid_seq_is_idif(fid_seq(fid));
+}
+
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+ return fid_seq_is_local_file(fid_seq(fid));
+}
+
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+ return (seq >= FID_SEQ_NORMAL);
+}
+
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+ return fid_seq_is_norm(fid_seq(fid));
+}
+
+/* convert an OST objid into an IDIF FID SEQ number */
+static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx)
+{
+ return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff);
+}
+
+/* convert a packed IDIF FID into an OST objid */
+static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver)
+{
+ return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid;
+}
+
+/* extract ost index from IDIF FID */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+ return (fid_seq(fid) >> 16) & 0xffff;
+}
+
+/* extract OST sequence (group) from a wire ost_id (id/seq) pair */
+static inline __u64 ostid_seq(const struct ost_id *ostid)
+{
+ if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+ return FID_SEQ_OST_MDT0;
+
+ if (fid_seq_is_default(ostid->oi.oi_seq))
+ return FID_SEQ_LOV_DEFAULT;
+
+ if (fid_is_idif(&ostid->oi_fid))
+ return FID_SEQ_OST_MDT0;
+
+ return fid_seq(&ostid->oi_fid);
+}
+
+/* extract OST objid from a wire ost_id (id/seq) pair */
+static inline __u64 ostid_id(const struct ost_id *ostid)
+{
+ if (fid_seq_is_mdt0(ostid_seq(ostid)))
+ return ostid->oi.oi_id & IDIF_OID_MASK;
+
+ if (fid_is_idif(&ostid->oi_fid))
+ return fid_idif_id(fid_seq(&ostid->oi_fid),
+ fid_oid(&ostid->oi_fid), 0);
+
+ return fid_oid(&ostid->oi_fid);
+}
+
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+ if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+ oi->oi.oi_seq = seq;
+ } else {
+ oi->oi_fid.f_seq = seq;
+ /* Note: if f_oid + f_ver is zero, we need to init it
+ * to 1; otherwise ostid_seq() will treat this
+ * as an old ostid (oi_seq == 0). */
+ if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0)
+ oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+ }
+}
+
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+ ostid_set_seq(oi, FID_SEQ_OST_MDT0);
+}
+
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+ ostid_set_seq(oi, FID_SEQ_ECHO);
+}
+
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+ ostid_set_seq(oi, FID_SEQ_LLOG);
+}
+
+/**
+ * Note: we need to check oi_seq to decide where to set oi_id,
+ * so oi_seq should always be set before oi_id.
+ */
+static inline void ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ if (oid >= IDIF_MAX_OID) {
+ CERROR("Bad %llu to set "DOSTID"\n",
+ oid, POSTID(oi));
+ return;
+ }
+ oi->oi.oi_id = oid;
+ } else {
+ if (oid > OBIF_MAX_OID) {
+ CERROR("Bad %llu to set "DOSTID"\n",
+ oid, POSTID(oi));
+ return;
+ }
+ oi->oi_fid.f_oid = oid;
+ }
+}
+
+static inline void ostid_inc_id(struct ost_id *oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) {
+ CERROR("Bad inc "DOSTID"\n", POSTID(oi));
+ return;
+ }
+ oi->oi.oi_id++;
+ } else {
+ oi->oi_fid.f_oid++;
+ }
+}
+
+static inline void ostid_dec_id(struct ost_id *oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(oi)))
+ oi->oi.oi_id--;
+ else
+ oi->oi_fid.f_oid--;
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID. This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs. Note that if an id/seq is already in FID/IDIF format it will
+ * be passed through unchanged. Only legacy OST objects in "group 0"
+ * will be mapped into the IDIF namespace so that they can fit into the
+ * struct lu_fid fields without loss. For reference see:
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid,
+ __u32 ost_idx)
+{
+ if (ost_idx > 0xffff) {
+ CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid),
+ ost_idx);
+ return -EBADF;
+ }
+
+ if (fid_seq_is_mdt0(ostid_seq(ostid))) {
+ /* This is a "legacy" (old 1.x/2.early) OST object in "group 0"
+ * that we map into the IDIF namespace. It allows up to 2^48
+ * objects per OST, as this is the object namespace that has
+ * been in production for years. This can handle create rates
+ * of 1M objects/s/OST for 9 years, or combinations thereof. */
+ if (ostid_id(ostid) >= IDIF_MAX_OID) {
+ CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+ POSTID(ostid), ost_idx);
+ return -EBADF;
+ }
+ fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx);
+ /* truncate to 32 bits by assignment */
+ fid->f_oid = ostid_id(ostid);
+ /* in theory, not currently used */
+ fid->f_ver = ostid_id(ostid) >> 48;
+ } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ {
+ /* This is either an IDIF object, which identifies objects across
+ * all OSTs, or a regular FID. The IDIF namespace maps legacy
+ * OST objects into the FID namespace. In both cases, we just
+ * pass the FID through, no conversion needed. */
+ if (ostid->oi_fid.f_ver != 0) {
+ CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+ POSTID(ostid), ost_idx);
+ return -EBADF;
+ }
+ *fid = ostid->oi_fid;
+ }
+
+ return 0;
+}
+
+/* pack any OST FID into an ostid (id/seq) for the wire/disk */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+ if (unlikely(fid_seq_is_igif(fid->f_seq))) {
+ CERROR("bad IGIF, "DFID"\n", PFID(fid));
+ return -EBADF;
+ }
+
+ if (fid_is_idif(fid)) {
+ ostid_set_seq_mdt0(ostid);
+ ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid),
+ fid_ver(fid)));
+ } else {
+ ostid->oi_fid = *fid;
+ }
+
+ return 0;
+}
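+
+/*
+ * A hedged round-trip sketch for the two converters above, given a wire
+ * ost_id *ostid and an OST index of 0 (the index is illustrative):
+ *
+ * \code
+ * struct lu_fid fid;
+ * struct ost_id oi2;
+ *
+ * if (ostid_to_fid(&fid, ostid, 0) == 0 &&
+ *     fid_to_ostid(&fid, &oi2) == 0) {
+ *      // oi2 now identifies the same object as *ostid
+ * }
+ * \endcode
+ */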
+
+/* Check whether the fid is for LAST_ID */
+static inline int fid_is_last_id(const struct lu_fid *fid)
+{
+ return (fid_oid(fid) == 0);
+}
+
+/**
+ * Get the inode number from an igif.
+ * \param fid an igif to get the inode number from.
+ * \return inode number for the igif.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+ return fid_seq(fid);
+}
+
+extern void lustre_swab_ost_id(struct ost_id *oid);
+
+/**
+ * Get the inode generation from an igif.
+ * \param fid an igif to get the inode generation from.
+ * \return inode generation for the igif.
+ */
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+ return fid_oid(fid);
+}
+
+/**
+ * Build igif from the inode number/generation.
+ */
+static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen)
+{
+ fid->f_seq = ino;
+ fid->f_oid = gen;
+ fid->f_ver = 0;
+}
+
+/*
+ * Fids are transmitted across network (in the sender byte-ordering),
+ * and stored on disk in big-endian order.
+ */
+static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+ dst->f_seq = cpu_to_le64(fid_seq(src));
+ dst->f_oid = cpu_to_le32(fid_oid(src));
+ dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+
+static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+ dst->f_seq = le64_to_cpu(fid_seq(src));
+ dst->f_oid = le32_to_cpu(fid_oid(src));
+ dst->f_ver = le32_to_cpu(fid_ver(src));
+}
+
+static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src)
+{
+ dst->f_seq = cpu_to_be64(fid_seq(src));
+ dst->f_oid = cpu_to_be32(fid_oid(src));
+ dst->f_ver = cpu_to_be32(fid_ver(src));
+}
+
+static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+ dst->f_seq = be64_to_cpu(fid_seq(src));
+ dst->f_oid = be32_to_cpu(fid_oid(src));
+ dst->f_ver = be32_to_cpu(fid_ver(src));
+}
+
+static inline int fid_is_sane(const struct lu_fid *fid)
+{
+ return fid != NULL &&
+ ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) ||
+ fid_is_igif(fid) || fid_is_idif(fid) ||
+ fid_seq_is_rsvd(fid_seq(fid)));
+}
+
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+ return fid_seq(fid) == 0 && fid_oid(fid) == 0;
+}
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+extern void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+
+static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1)
+{
+ return memcmp(f0, f1, sizeof(*f0)) == 0;
+}
+
+#define __diff_normalize(val0, val1) \
+({ \
+ typeof(val0) __val0 = (val0); \
+ typeof(val1) __val1 = (val1); \
+ \
+ (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1); \
+})
+
+static inline int lu_fid_cmp(const struct lu_fid *f0,
+ const struct lu_fid *f1)
+{
+ return
+ __diff_normalize(fid_seq(f0), fid_seq(f1)) ?:
+ __diff_normalize(fid_oid(f0), fid_oid(f1)) ?:
+ __diff_normalize(fid_ver(f0), fid_ver(f1));
+}
+
+static inline void ostid_cpu_to_le(const struct ost_id *src_oi,
+ struct ost_id *dst_oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+ dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+ } else {
+ fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+ }
+}
+
+static inline void ostid_le_to_cpu(const struct ost_id *src_oi,
+ struct ost_id *dst_oi)
+{
+ if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+ dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+ } else {
+ fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+ }
+}
+
+/** @} lu_fid */
+
+/** \defgroup lu_dir lu_dir
+ * @{ */
+
+/**
+ * Enumeration of possible directory entry attributes.
+ *
+ * Attributes follow directory entry header in the order they appear in this
+ * enumeration.
+ */
+enum lu_dirent_attrs {
+ LUDA_FID = 0x0001,
+ LUDA_TYPE = 0x0002,
+ LUDA_64BITHASH = 0x0004,
+
+ /* The following attrs are for MDT internal use only,
+ * not visible to the client */
+
+ /* Verify the dirent consistency */
+ LUDA_VERIFY = 0x8000,
+ /* Only check but not repair the dirent inconsistency */
+ LUDA_VERIFY_DRYRUN = 0x4000,
+ /* The dirent has been repaired, or to be repaired (dryrun). */
+ LUDA_REPAIR = 0x2000,
+ /* The system is upgraded; it has been or is to be repaired (dryrun). */
+ LUDA_UPGRADE = 0x1000,
+ /* Ignore this record, go to next directly. */
+ LUDA_IGNORE = 0x0800,
+};
+
+#define LU_DIRENT_ATTRS_MASK 0xf800
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ */
+struct lu_dirent {
+ /** valid if LUDA_FID is set. */
+ struct lu_fid lde_fid;
+ /** a unique entry identifier: a hash or an offset. */
+ __u64 lde_hash;
+ /** total record length, including all attributes. */
+ __u16 lde_reclen;
+ /** name length */
+ __u16 lde_namelen;
+ /** optional variable size attributes following this entry.
+ * taken from enum lu_dirent_attrs.
+ */
+ __u32 lde_attrs;
+ /** name is followed by the attributes indicated in ->lde_attrs, in
+ * their natural order. After the last attribute, padding bytes are
+ * added to make ->lde_reclen a multiple of 8.
+ */
+ char lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way. It
+ * is assumed that the consumer of an attribute knows its format. This means
+ * that it is impossible to skip over an unknown attribute, except by skipping over all
+ * remaining attributes (by using ->lde_reclen), which is not too
+ * constraining, because new server versions will append new attributes at
+ * the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * will be almost always requested by the client and supplied by the server.
+ *
+ * Aligned to 8 bytes.
+ */
+/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */
+
+/**
+ * File type.
+ *
+ * Aligned to 2 bytes.
+ */
+struct luda_type {
+ __u16 lt_type;
+};
+
+#ifndef IFSHIFT
+#define IFSHIFT 12
+#endif
+
+#ifndef IFTODT
+#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT)
+#endif
+#ifndef DTTOIF
+#define DTTOIF(dirtype) ((dirtype) << IFSHIFT)
+#endif
+
+
+struct lu_dirpage {
+ __u64 ldp_hash_start;
+ __u64 ldp_hash_end;
+ __u32 ldp_flags;
+ __u32 ldp_pad0;
+ struct lu_dirent ldp_entries[0];
+};
+
+enum lu_dirpage_flags {
+ /**
+ * dirpage contains no entry.
+ */
+ LDF_EMPTY = 1 << 0,
+ /**
+ * last entry's lde_hash equals ldp_hash_end.
+ */
+ LDF_COLLIDE = 1 << 1
+};
+
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+ if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY)
+ return NULL;
+ else
+ return dp->ldp_entries;
+}
+
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+ struct lu_dirent *next;
+
+ if (le16_to_cpu(ent->lde_reclen) != 0)
+ next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
+ else
+ next = NULL;
+
+ return next;
+}
+
+static inline int lu_dirent_calc_size(int namelen, __u16 attr)
+{
+ int size;
+
+ if (attr & LUDA_TYPE) {
+ const unsigned align = sizeof(struct luda_type) - 1;
+ size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
+ size += sizeof(struct luda_type);
+ } else
+ size = sizeof(struct lu_dirent) + namelen;
+
+ return (size + 7) & ~7;
+}
+
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+ if (le16_to_cpu(ent->lde_reclen) == 0) {
+ return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
+ le32_to_cpu(ent->lde_attrs));
+ }
+ return le16_to_cpu(ent->lde_reclen);
+}
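+
+/*
+ * Walking the entries of one readdir page, as a sketch built on the helpers
+ * above (dp points to a struct lu_dirpage in little-endian wire format):
+ *
+ * \code
+ * struct lu_dirent *ent;
+ *
+ * for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent)) {
+ *      int namelen = le16_to_cpu(ent->lde_namelen);
+ *      // ent->lde_name[0 .. namelen - 1] holds the entry name
+ * }
+ * \endcode
+ */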
+
+#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
+
+/**
+ * MDS_READPAGE page size
+ *
+ * This is the directory page size packed in MDS_READPAGE RPC.
+ * It's different from PAGE_CACHE_SIZE because the client needs to
+ * access the struct lu_dirpage header packed at the beginning of
+ * the "page"; without this there isn't any way to find where the
+ * lu_dirpage header is if client and server PAGE_CACHE_SIZE differ.
+ */
+#define LU_PAGE_SHIFT 12
+#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT)
+#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1))
+
+#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
+
+/** @} lu_dir */
+
+struct lustre_handle {
+ __u64 cookie;
+};
+#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
+
+static inline int lustre_handle_is_used(struct lustre_handle *lh)
+{
+ return lh->cookie != 0ull;
+}
+
+static inline int lustre_handle_equal(const struct lustre_handle *lh1,
+ const struct lustre_handle *lh2)
+{
+ return lh1->cookie == lh2->cookie;
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+ struct lustre_handle *src)
+{
+ tgt->cookie = src->cookie;
+}
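+
+/*
+ * Illustrative sketch only: a zero cookie means "no handle", so callers
+ * normally test lustre_handle_is_used() before comparing or copying.
+ */
+static inline void lustre_handle_copy_example(struct lustre_handle *tgt,
+					      struct lustre_handle *src)
+{
+	if (lustre_handle_is_used(src) && !lustre_handle_equal(tgt, src))
+		lustre_handle_copy(tgt, src);
+}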
+
+/* flags for lm_flags */
+#define MSGHDR_AT_SUPPORT 0x1
+#define MSGHDR_CKSUM_INCOMPAT18 0x2
+
+#define lustre_msg lustre_msg_v2
+/* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
+struct lustre_msg_v2 {
+ __u32 lm_bufcount;
+ __u32 lm_secflvr;
+ __u32 lm_magic;
+ __u32 lm_repsize;
+ __u32 lm_cksum;
+ __u32 lm_flags;
+ __u32 lm_padding_2;
+ __u32 lm_padding_3;
+ __u32 lm_buflens[0];
+};
+
+/* without gss, ptlrpc_body is put at the first buffer. */
+#define PTLRPC_NUM_VERSIONS 4
+#define JOBSTATS_JOBID_SIZE 32 /* 32 bytes string */
+struct ptlrpc_body_v3 {
+ struct lustre_handle pb_handle;
+ __u32 pb_type;
+ __u32 pb_version;
+ __u32 pb_opc;
+ __u32 pb_status;
+ __u64 pb_last_xid;
+ __u64 pb_last_seen;
+ __u64 pb_last_committed;
+ __u64 pb_transno;
+ __u32 pb_flags;
+ __u32 pb_op_flags;
+ __u32 pb_conn_cnt;
+ __u32 pb_timeout; /* for req, the deadline, for rep, the service est */
+ __u32 pb_service_time; /* for rep, actual service time */
+ __u32 pb_limit;
+ __u64 pb_slv;
+ /* VBR: pre-versions */
+ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+ /* padding for future needs */
+ __u64 pb_padding[4];
+ char pb_jobid[JOBSTATS_JOBID_SIZE];
+};
+#define ptlrpc_body ptlrpc_body_v3
+
+struct ptlrpc_body_v2 {
+ struct lustre_handle pb_handle;
+ __u32 pb_type;
+ __u32 pb_version;
+ __u32 pb_opc;
+ __u32 pb_status;
+ __u64 pb_last_xid;
+ __u64 pb_last_seen;
+ __u64 pb_last_committed;
+ __u64 pb_transno;
+ __u32 pb_flags;
+ __u32 pb_op_flags;
+ __u32 pb_conn_cnt;
+ __u32 pb_timeout; /* for req, the deadline, for rep, the service est */
+ __u32 pb_service_time; /* for rep, actual service time, also used for
+ net_latency of req */
+ __u32 pb_limit;
+ __u64 pb_slv;
+ /* VBR: pre-versions */
+ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+ /* padding for future needs */
+ __u64 pb_padding[4];
+};
+
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+
+/* message body offset for lustre_msg_v2 */
+/* ptlrpc body offset in all request/reply messages */
+#define MSG_PTLRPC_BODY_OFF 0
+
+/* normal request/reply message record offset */
+#define REQ_REC_OFF 1
+#define REPLY_REC_OFF 1
+
+/* ldlm request message body offset */
+#define DLM_LOCKREQ_OFF 1 /* lockreq offset */
+#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */
+
+/* ldlm intent lock message body offset */
+#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */
+#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */
+
+/* ldlm reply message body offset */
+#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */
+#define DLM_REPLY_REC_OFF 2 /* reply record offset */
+
+/** only use in req->rq_{req,rep}_swab_mask */
+#define MSG_PTLRPC_HEADER_OFF 31
+
+/* Flags that are operation-specific go in the top 16 bits. */
+#define MSG_OP_FLAG_MASK 0xffff0000
+#define MSG_OP_FLAG_SHIFT 16
+
+/* Flags that apply to all requests are in the bottom 16 bits */
+#define MSG_GEN_FLAG_MASK 0x0000ffff
+#define MSG_LAST_REPLAY 0x0001
+#define MSG_RESENT 0x0002
+#define MSG_REPLAY 0x0004
+/* #define MSG_AT_SUPPORT 0x0008
+ * This was used in early prototypes of adaptive timeouts, and while there
+ * shouldn't be any users of that code, there also isn't a need to use this
+ * bit. Defer usage until at least 1.10 to avoid potential conflict. */
+#define MSG_DELAY_REPLAY 0x0010
+#define MSG_VERSION_REPLAY 0x0020
+#define MSG_REQ_REPLAY_DONE 0x0040
+#define MSG_LOCK_REPLAY_DONE 0x0080
+
+/*
+ * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
+ */
+
+#define MSG_CONNECT_RECOVERING 0x00000001
+#define MSG_CONNECT_RECONNECT 0x00000002
+#define MSG_CONNECT_REPLAYABLE 0x00000004
+//#define MSG_CONNECT_PEER 0x8
+#define MSG_CONNECT_LIBCLIENT 0x00000010
+#define MSG_CONNECT_INITIAL 0x00000020
+#define MSG_CONNECT_ASYNC 0x00000040
+#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */
+
+/* Connect flags */
+#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL 0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated.
+ *We do not support JOIN FILE
+ *anymore; reserve this flag
+ *just to prevent the bit
+ *from being reused.*/
+#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits
+ * directory hash */
+#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+ * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+ * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+ * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */
+#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* flock deadlock detection */
+#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/*create stripe disposition*/
+
+/* XXX README XXX:
+ * Please DO NOT add flag values here before first ensuring that this same
+ * flag value is not in use on some other branch. Please clear any such
+ * changes with senior engineers before starting to use a new flag. Then,
+ * submit a small patch against EVERY branch that ONLY adds the new flag,
+ * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the
+ * flag to check_obd_connect_data(), and updates wiretests accordingly, so it
+ * can be approved and landed easily to reserve the flag for future use. */
+
+/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS
+ * connection. It is a temporary bug fix for Imperative Recovery interop
+ * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for
+ * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. */
+#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS
+
+#define OCD_HAS_FLAG(ocd, flg) \
+ (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg))
+
+
+#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE
+
+#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+ OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+ OBD_CONNECT_IBITS | \
+ OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+ OBD_CONNECT_RMT_CLIENT | \
+ OBD_CONNECT_RMT_CLIENT_FORCE | \
+ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \
+ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \
+ OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \
+ OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \
+ OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \
+ OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
+ OBD_CONNECT_EINPROGRESS | \
+ OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
+ OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+ OBD_CONNECT_PINGLESS | OBD_CONNECT_MAX_EASIZE |\
+ OBD_CONNECT_FLOCK_DEAD | \
+ OBD_CONNECT_DISP_STRIPE)
+
+#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
+ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+ LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+ OBD_CONNECT_RMT_CLIENT | \
+ OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \
+ OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \
+ OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \
+ OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \
+ OBD_CONNECT_MAX_EASIZE | \
+ OBD_CONNECT_EINPROGRESS | \
+ OBD_CONNECT_JOBSTATS | \
+ OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
+ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+ OBD_CONNECT_PINGLESS)
+#define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
+ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+ OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
+
+/* Features required for this version of the client to work with server */
+#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
+ OBD_CONNECT_FULL20)
+
+#define OBD_OCD_VERSION(major, minor, patch, fix) (((major)<<24) + \
+ ((minor)<<16) + \
+ ((patch)<<8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
+#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255)
+#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255)
+#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255)
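+
+/*
+ * Worked example: OBD_OCD_VERSION(2, 5, 3, 0) encodes to 0x02050300, and
+ * OBD_OCD_VERSION_MAJOR/MINOR/PATCH/FIX() recover 2, 5, 3 and 0 from it.
+ */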
+
+/* This structure is used for both request and reply.
+ *
+ * If we eventually have separate connect data for different types, which we
+ * almost certainly will, then perhaps we stick a union in here. */
+struct obd_connect_data_v1 {
+ __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+ __u32 ocd_version; /* lustre release version number */
+ __u32 ocd_grant; /* initial cache grant amount (bytes) */
+ __u32 ocd_index; /* LOV index to connect to */
+ __u32 ocd_brw_size; /* Maximum BRW size in bytes, must be 2^n */
+ __u64 ocd_ibits_known; /* inode bits this client understands */
+ __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */
+ __u8 ocd_inodespace; /* log2 of the per-inode space consumption */
+ __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */
+ __u32 ocd_unused; /* also fix lustre_swab_connect */
+ __u64 ocd_transno; /* first transno from client to be replayed */
+ __u32 ocd_group; /* MDS group on OST */
+ __u32 ocd_cksum_types; /* supported checksum algorithms */
+ __u32 ocd_max_easize; /* How big LOV EA can be on MDS */
+ __u32 ocd_instance; /* also fix lustre_swab_connect */
+ __u64 ocd_maxbytes; /* Maximum stripe size in bytes */
+};
+
+struct obd_connect_data {
+ __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+ __u32 ocd_version; /* lustre release version number */
+ __u32 ocd_grant; /* initial cache grant amount (bytes) */
+ __u32 ocd_index; /* LOV index to connect to */
+ __u32 ocd_brw_size; /* Maximum BRW size in bytes */
+ __u64 ocd_ibits_known; /* inode bits this client understands */
+ __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */
+ __u8 ocd_inodespace; /* log2 of the per-inode space consumption */
+ __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */
+ __u32 ocd_unused; /* also fix lustre_swab_connect */
+ __u64 ocd_transno; /* first transno from client to be replayed */
+ __u32 ocd_group; /* MDS group on OST */
+ __u32 ocd_cksum_types; /* supported checksum algorithms */
+ __u32 ocd_max_easize; /* How big LOV EA can be on MDS */
+ __u32 ocd_instance; /* instance # of this target */
+ __u64 ocd_maxbytes; /* Maximum stripe size in bytes */
+ /* Fields after ocd_maxbytes are only accessible by the receiver
+ * if the corresponding flag in ocd_connect_flags is set. Accessing
+ * any field after ocd_maxbytes on the receiver without a valid flag
+ * may result in an out-of-bounds memory access and kernel oops. */
+ __u64 padding1; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding2; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */
+ __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */
+};
+/* XXX README XXX:
+ * Please DO NOT use any fields here before first ensuring that this same
+ * field is not in use on some other branch. Please clear any such changes
+ * with senior engineers before starting to use a new field. Then, submit
+ * a small patch against EVERY branch that ONLY adds the new field along with
+ * the matching OBD_CONNECT flag, so that it can be approved and landed easily to
+ * reserve the flag for future use. */
+
+
+extern void lustre_swab_connect(struct obd_connect_data *ocd);
+
+/*
+ * Supported checksum algorithms. Up to 32 checksum types are supported.
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
+ * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
+ * algorithm and also the OBD_FL_CKSUM* flags.
+ */
+typedef enum {
+ OBD_CKSUM_CRC32 = 0x00000001,
+ OBD_CKSUM_ADLER = 0x00000002,
+ OBD_CKSUM_CRC32C= 0x00000004,
+} cksum_type_t;
+
+/*
+ * OST requests: OBDO & OBD request records
+ */
+
+/* opcodes */
+typedef enum {
+ OST_REPLY = 0, /* reply ? */
+ OST_GETATTR = 1,
+ OST_SETATTR = 2,
+ OST_READ = 3,
+ OST_WRITE = 4,
+ OST_CREATE = 5,
+ OST_DESTROY = 6,
+ OST_GET_INFO = 7,
+ OST_CONNECT = 8,
+ OST_DISCONNECT = 9,
+ OST_PUNCH = 10,
+ OST_OPEN = 11,
+ OST_CLOSE = 12,
+ OST_STATFS = 13,
+ OST_SYNC = 16,
+ OST_SET_INFO = 17,
+ OST_QUOTACHECK = 18,
+ OST_QUOTACTL = 19,
+ OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
+ OST_LAST_OPC
+} ost_cmd_t;
+#define OST_FIRST_OPC OST_REPLY
+
+enum obdo_flags {
+ OBD_FL_INLINEDATA = 0x00000001,
+ OBD_FL_OBDMDEXISTS = 0x00000002,
+ OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */
+ OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */
+ OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/
+ OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */
+ OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */
+ OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */
+ OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */
+ OBD_FL_CREATE_CROW = 0x00000400, /* object should be created on write */
+ OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */
+ OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */
+ OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */
+ OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
+ OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */
+ OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */
+ OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+ OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client.
+ * XXX: obsoleted - reserved for old
+ * clients prior to 2.2 */
+ OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+ OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */
+
+ /* Note that while these checksum values are currently separate bits,
+ * in 2.x we can actually allow all values from 1-31 if we wanted. */
+ OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+ OBD_FL_CKSUM_CRC32C,
+
+ /* mask for local-only flag, which won't be sent over network */
+ OBD_FL_LOCAL_MASK = 0xF0000000,
+};
+
+#define LOV_MAGIC_V1 0x0BD10BD0
+#define LOV_MAGIC LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_MAGIC_V3 0x0BD30BD0
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in the wire protocol,
+ * so we can't just change them without long preparation, but we still need a
+ * mechanism to allow LOD to differentiate hint versus ready striping.
+ * so, at the moment we do a trick: the MDT knows what to expect from a request
+ * depending on the case (replay uses ready striping, a non-replay req uses
+ * hints), so the MDT replaces the magic with the appropriate one and now LOD
+ * can easily understand what's inside -bzzz
+ */
+#define LOV_MAGIC_V1_DEF 0x0CD10BD0
+#define LOV_MAGIC_V3_DEF 0x0CD30BD0
+
+#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */
+#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */
+#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */
+#define LOV_PATTERN_CMOBD 0x200
+
+#define LOV_PATTERN_F_MASK 0xffff0000
+#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */
+
+#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK)
+#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK)
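+
+/*
+ * Worked example: an HSM-released RAID0 file has
+ * lmm_pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED);
+ * lov_pattern() extracts LOV_PATTERN_RAID0 and lov_pattern_flags()
+ * extracts LOV_PATTERN_F_RELEASED.
+ */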
+
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/
+ struct ost_id l_ost_oi; /* OST object ID */
+ __u32 l_ost_gen; /* generation of this l_ost_idx */
+ __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */
+ __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ /* lmm_stripe_count used to be __u32 */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ __u16 lmm_layout_gen; /* layout generation number */
+ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ * ........
+ * __u64 lmm_object_id;
+ * __u64 lmm_object_seq;
+ * ......
+ * }
+ * to identify the LOV(MDT) object, and lmm_object_seq will
+ * be normal_fid, which makes it hard to combine these conversions
+ * into ostid_to_fid(), so we will do the lmm_oi/fid conversion separately.
+ *
+ * We can tell the lmm_oi by this way,
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ * lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi does not have any "real" usage,
+ * except for printing some information, and the user can always
+ * get the real FID from the LMA; besides, this multiple-case check might
+ * make swabbing more complicated. So we will keep using id/seq for lmm_oi.
+ */
+
+static inline void fid_to_lmm_oi(const struct lu_fid *fid,
+ struct ost_id *oi)
+{
+ oi->oi.oi_id = fid_oid(fid);
+ oi->oi.oi_seq = fid_seq(fid);
+}
+
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+ oi->oi.oi_seq = seq;
+}
+
+static inline __u64 lmm_oi_id(struct ost_id *oi)
+{
+ return oi->oi.oi_id;
+}
+
+static inline __u64 lmm_oi_seq(struct ost_id *oi)
+{
+ return oi->oi.oi_seq;
+}
+
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+ struct ost_id *src_oi)
+{
+ dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+ struct ost_id *src_oi)
+{
+ dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+ dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
+
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
+
+#define MAX_MD_SIZE \
+ (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
+#define MIN_MD_SIZE \
+ (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default"
+#define XATTR_USER_PREFIX "user."
+#define XATTR_TRUSTED_PREFIX "trusted."
+#define XATTR_SECURITY_PREFIX "security."
+#define XATTR_LUSTRE_PREFIX "lustre."
+
+#define XATTR_NAME_LOV "trusted.lov"
+#define XATTR_NAME_LMA "trusted.lma"
+#define XATTR_NAME_LMV "trusted.lmv"
+#define XATTR_NAME_LINK "trusted.link"
+#define XATTR_NAME_FID "trusted.fid"
+#define XATTR_NAME_VERSION "trusted.version"
+#define XATTR_NAME_SOM "trusted.som"
+#define XATTR_NAME_HSM "trusted.hsm"
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace"
+
+struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */
+ __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ /* lmm_stripe_count used to be __u32 */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ __u16 lmm_layout_gen; /* layout generation number */
+ char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
+{
+ if (lmm_magic == LOV_MAGIC_V3)
+ return sizeof(struct lov_mds_md_v3) +
+ stripes * sizeof(struct lov_ost_data_v1);
+ else
+ return sizeof(struct lov_mds_md_v1) +
+ stripes * sizeof(struct lov_ost_data_v1);
+}
+
+static inline __u32
+lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic)
+{
+ switch (lmm_magic) {
+ case LOV_MAGIC_V1: {
+ struct lov_mds_md_v1 lmm;
+
+ if (buf_size < sizeof(lmm))
+ return 0;
+
+ return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]);
+ }
+ case LOV_MAGIC_V3: {
+ struct lov_mds_md_v3 lmm;
+
+ if (buf_size < sizeof(lmm))
+ return 0;
+
+ return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]);
+ }
+ default:
+ return 0;
+ }
+}
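+
+/*
+ * Illustrative sketch only (hypothetical stripe count): sizing a LOV EA
+ * buffer with lov_mds_md_size() and checking it back with
+ * lov_mds_md_max_stripe_count().
+ */
+static inline int lov_mds_md_size_example(void)
+{
+	__u32 size = lov_mds_md_size(4, LOV_MAGIC_V3);
+
+	return lov_mds_md_max_stripe_count(size, LOV_MAGIC_V3) == 4;
+}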
+
+#define OBD_MD_FLID (0x00000001ULL) /* object ID */
+#define OBD_MD_FLATIME (0x00000002ULL) /* access time */
+#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */
+#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */
+#define OBD_MD_FLSIZE (0x00000010ULL) /* size */
+#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */
+#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */
+#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */
+#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */
+#define OBD_MD_FLUID (0x00000200ULL) /* user ID */
+#define OBD_MD_FLGID (0x00000400ULL) /* group ID */
+#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */
+#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */
+#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */
+/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */
+#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */
+#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */
+#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */
+#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */
+#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */
+#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */
+/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */
+#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */
+#define OBD_MD_FLGROUP (0x01000000ULL) /* group */
+#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */
+ /* ->mds if epoch opens or closes */
+#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */
+#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */
+#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
+
+#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */
+#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */
+#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */
+
+#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */
+#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */
+#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */
+#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes
+ * under lock; for xattr
+ * requests means the
+ * client holds the lock */
+#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */
+
+#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */
+#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */
+#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */
+#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
+
+#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
+#define OBD_MD_FLRELEASED (0x0020000000000000ULL) /* file released */
+
+#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
+ OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \
+ OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \
+ OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \
+ OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP)
+
+#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS)
+
+/* don't forget obdo_fid which is way down at the bottom so it can
+ * come after the definition of llog_cookie */
+
+enum hss_valid {
+ HSS_SETMASK = 0x01,
+ HSS_CLEARMASK = 0x02,
+ HSS_ARCHIVE_ID = 0x04,
+};
+
+struct hsm_state_set {
+ __u32 hss_valid;
+ __u32 hss_archive_id;
+ __u64 hss_setmask;
+ __u64 hss_clearmask;
+};
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss);
+
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
+/* ost_body.data values for OST_BRW */
+
+#define OBD_BRW_READ 0x01
+#define OBD_BRW_WRITE 0x02
+#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous
+ * transfer and is not accounted in
+ * the grant. */
+#define OBD_BRW_CHECK 0x10
+#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED 0x40 /* the ost manages this */
+#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA 0x100
+#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+
+#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+ struct ost_id ioo_oid; /* object ID, if multi-obj BRW */
+ __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4,
+ * now (PTLRPC_BULK_OPS_COUNT - 1) in
+ * high 16 bits in 2.4 and later */
+ __u32 ioo_bufcnt; /* number of niobufs for this object */
+};
+
+#define IOOBJ_MAX_BRW_BITS 16
+#define IOOBJ_TYPE_MASK ((1U << IOOBJ_MAX_BRW_BITS) - 1)
+#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num) \
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
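+
+/*
+ * Illustrative sketch only: ioo_max_brw stores (num - 1) in its high 16 bits,
+ * so ioobj_max_brw_set() followed by ioobj_max_brw_get() returns the original
+ * value (8 is just an example count).
+ */
+static inline __u32 ioobj_max_brw_example(struct obd_ioobj *ioo)
+{
+	ioobj_max_brw_set(ioo, 8);
+	return ioobj_max_brw_get(ioo);	/* 8 */
+}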
+
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* multiple of 8 bytes => can array */
+struct niobuf_remote {
+ __u64 offset;
+ __u32 len;
+ __u32 flags;
+};
+
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
+/* lock value block communicated between the filter and llite */
+
+/* OST_LVB_ERR_INIT is needed because the return code in rc is
+ * negative, i.e. because ((MASK + rc) & MASK) != MASK. */
+#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
+#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
+#define OST_LVB_IS_ERR(blocks) \
+ ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK)
+#define OST_LVB_SET_ERR(blocks, rc) \
+ do { blocks = OST_LVB_ERR_INIT + rc; } while (0)
+#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT)
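+
+/*
+ * Illustrative sketch only: encoding a negative errno (here -5, i.e. -EIO)
+ * into lvb_blocks and recovering it with the macros above.
+ */
+static inline int ost_lvb_err_example(void)
+{
+	__u64 blocks;
+
+	OST_LVB_SET_ERR(blocks, -5);
+	return OST_LVB_IS_ERR(blocks) ? OST_LVB_GET_ERR(blocks) : 0; /* -5 */
+}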
+
+struct ost_lvb_v1 {
+ __u64 lvb_size;
+ __s64 lvb_mtime;
+ __s64 lvb_atime;
+ __s64 lvb_ctime;
+ __u64 lvb_blocks;
+};
+
+extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb);
+
+struct ost_lvb {
+ __u64 lvb_size;
+ __s64 lvb_mtime;
+ __s64 lvb_atime;
+ __s64 lvb_ctime;
+ __u64 lvb_blocks;
+ __u32 lvb_mtime_ns;
+ __u32 lvb_atime_ns;
+ __u32 lvb_ctime_ns;
+ __u32 lvb_padding;
+};
+
+extern void lustre_swab_ost_lvb(struct ost_lvb *lvb);
+
+/*
+ * lquota data structures
+ */
+
+#ifndef QUOTABLOCK_BITS
+#define QUOTABLOCK_BITS 10
+#endif
+
+#ifndef QUOTABLOCK_SIZE
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+#endif
+
+#ifndef toqb
+#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#endif
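+
+/*
+ * Worked example: with QUOTABLOCK_BITS of 10, toqb() rounds a byte count up
+ * to 1KiB quota blocks, e.g. toqb(1) == 1, toqb(1024) == 1, toqb(1025) == 2.
+ */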
+
+/* The lquota_id structure is a union of all the possible identifier types that
+ * can be used with quota, this includes:
+ * - 64-bit user ID
+ * - 64-bit group ID
+ * - a FID which can be used for per-directory quota in the future */
+union lquota_id {
+ struct lu_fid qid_fid; /* FID for per-directory quota */
+ __u64 qid_uid; /* user identifier */
+ __u64 qid_gid; /* group identifier */
+};
+
+/* quotactl management */
+struct obd_quotactl {
+ __u32 qc_cmd;
+ __u32 qc_type; /* see Q_* flag below */
+ __u32 qc_id;
+ __u32 qc_stat;
+ struct obd_dqinfo qc_dqinfo;
+ struct obd_dqblk qc_dqblk;
+};
+
+extern void lustre_swab_obd_quotactl(struct obd_quotactl *q);
+
+#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */
+#define Q_GETOINFO 0x800102 /* get obd quota info */
+#define Q_GETOQUOTA 0x800103 /* get obd quotas */
+#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */
+
+#define Q_COPY(out, in, member) (out)->member = (in)->member
+
+#define QCTL_COPY(out, in) \
+do { \
+ Q_COPY(out, in, qc_cmd); \
+ Q_COPY(out, in, qc_type); \
+ Q_COPY(out, in, qc_id); \
+ Q_COPY(out, in, qc_stat); \
+ Q_COPY(out, in, qc_dqinfo); \
+ Q_COPY(out, in, qc_dqblk); \
+} while (0)
+
+/* Body of quota request used for quota acquire/release RPCs between quota
+ * master (aka QMT) and slaves (aka QSD). */
+struct quota_body {
+ struct lu_fid qb_fid; /* FID of global index packing the pool ID
+ * and type (data or metadata) as well as
+ * the quota type (user or group). */
+ union lquota_id qb_id; /* uid or gid or directory FID */
+ __u32 qb_flags; /* see below */
+ __u32 qb_padding;
+ __u64 qb_count; /* acquire/release count (kbytes/inodes) */
+ __u64 qb_usage; /* current slave usage (kbytes/inodes) */
+ __u64 qb_slv_ver; /* slave index file version */
+ struct lustre_handle qb_lockh; /* per-ID lock handle */
+ struct lustre_handle qb_glb_lockh; /* global lock handle */
+ __u64 qb_padding1[4];
+};
+
+/* When the quota_body is used in the reply of quota global intent
+ * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. */
+#define qb_slv_fid qb_fid
+/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in
+ * quota reply */
+#define qb_qunit qb_usage
+
+#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */
+#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */
+#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */
+#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */
+
+extern void lustre_swab_quota_body(struct quota_body *b);
+
+/* Quota types currently supported */
+enum {
+ LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */
+ LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */
+ LQUOTA_TYPE_MAX
+};
+
+/* There are 2 different resource types on which a quota limit can be enforced:
+ * - inodes on the MDTs
+ * - blocks on the OSTs */
+enum {
+ LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */
+ LQUOTA_RES_DT = 0x02,
+ LQUOTA_LAST_RES,
+ LQUOTA_FIRST_RES = LQUOTA_RES_MD
+};
+#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1)
+
+/*
+ * Space accounting support
+ * Format of an accounting record, providing disk usage information for a given
+ * user or group
+ */
+struct lquota_acct_rec { /* 16 bytes */
+ __u64 bspace; /* current space in use */
+ __u64 ispace; /* current # inodes in use */
+};
+
+/*
+ * Global quota index support
+ * Format of a global record, providing global quota settings for a given quota
+ * identifier
+ */
+struct lquota_glb_rec { /* 32 bytes */
+ __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */
+ __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */
+ __u64 qbr_time; /* grace time, in seconds */
+ __u64 qbr_granted; /* how much is granted to slaves, in #inodes or
+ * kbytes */
+};
+
+/*
+ * Slave index support
+ * Format of a slave record, recording how much space is granted to a given
+ * slave
+ */
+struct lquota_slv_rec { /* 8 bytes */
+ __u64 qsr_granted; /* space granted to the slave for the key=ID,
+ * in #inodes or kbytes */
+};
+
+/* Data structures associated with the quota locks */
+
+/* Glimpse descriptor used for the index & per-ID quota locks */
+struct ldlm_gl_lquota_desc {
+ union lquota_id gl_id; /* quota ID subject to the glimpse */
+ __u64 gl_flags; /* see LQUOTA_FL* below */
+ __u64 gl_ver; /* new index version */
+ __u64 gl_hardlimit; /* new hardlimit or qunit value */
+ __u64 gl_softlimit; /* new softlimit */
+ __u64 gl_time;
+ __u64 gl_pad2;
+};
+#define gl_qunit gl_hardlimit /* current qunit value used when
+ * glimpsing per-ID quota locks */
+
+/* quota glimpse flags */
+#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */
+
+/* LVB used with quota (global and per-ID) locks */
+struct lquota_lvb {
+ __u64 lvb_flags; /* see LQUOTA_FL* above */
+ __u64 lvb_id_may_rel; /* space that might be released later */
+ __u64 lvb_id_rel; /* space released by the slave for this ID */
+ __u64 lvb_id_qunit; /* current qunit value */
+ __u64 lvb_pad1;
+};
+
+extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb);
+
+/* LVB used with global quota lock */
+#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */
+
+/* op codes */
+typedef enum {
+ QUOTA_DQACQ = 601,
+ QUOTA_DQREL = 602,
+ QUOTA_LAST_OPC
+} quota_cmd_t;
+#define QUOTA_FIRST_OPC QUOTA_DQACQ
+
+/*
+ * MDS REQ RECORDS
+ */
+
+/* opcodes */
+typedef enum {
+ MDS_GETATTR = 33,
+ MDS_GETATTR_NAME = 34,
+ MDS_CLOSE = 35,
+ MDS_REINT = 36,
+ MDS_READPAGE = 37,
+ MDS_CONNECT = 38,
+ MDS_DISCONNECT = 39,
+ MDS_GETSTATUS = 40,
+ MDS_STATFS = 41,
+ MDS_PIN = 42,
+ MDS_UNPIN = 43,
+ MDS_SYNC = 44,
+ MDS_DONE_WRITING = 45,
+ MDS_SET_INFO = 46,
+ MDS_QUOTACHECK = 47,
+ MDS_QUOTACTL = 48,
+ MDS_GETXATTR = 49,
+ MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */
+ MDS_WRITEPAGE = 51,
+ MDS_IS_SUBDIR = 52,
+ MDS_GET_INFO = 53,
+ MDS_HSM_STATE_GET = 54,
+ MDS_HSM_STATE_SET = 55,
+ MDS_HSM_ACTION = 56,
+ MDS_HSM_PROGRESS = 57,
+ MDS_HSM_REQUEST = 58,
+ MDS_HSM_CT_REGISTER = 59,
+ MDS_HSM_CT_UNREGISTER = 60,
+ MDS_SWAP_LAYOUTS = 61,
+ MDS_LAST_OPC
+} mds_cmd_t;
+
+#define MDS_FIRST_OPC MDS_GETATTR
+
+
+/* opcodes for object update */
+typedef enum {
+ UPDATE_OBJ = 1000,
+ UPDATE_LAST_OPC
+} update_cmd_t;
+
+#define UPDATE_FIRST_OPC UPDATE_OBJ
+
+/*
+ * Do not exceed 63
+ */
+
+typedef enum {
+ REINT_SETATTR = 1,
+ REINT_CREATE = 2,
+ REINT_LINK = 3,
+ REINT_UNLINK = 4,
+ REINT_RENAME = 5,
+ REINT_OPEN = 6,
+ REINT_SETXATTR = 7,
+ REINT_RMENTRY = 8,
+// REINT_WRITE = 9,
+ REINT_MAX
+} mds_reint_t, mdt_reint_t;
+
+extern void lustre_swab_generic_32s (__u32 *val);
+
+/* the disposition of the intent outlines what was executed */
+#define DISP_IT_EXECD 0x00000001
+#define DISP_LOOKUP_EXECD 0x00000002
+#define DISP_LOOKUP_NEG 0x00000004
+#define DISP_LOOKUP_POS 0x00000008
+#define DISP_OPEN_CREATE 0x00000010
+#define DISP_OPEN_OPEN 0x00000020
+#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */
+#define DISP_ENQ_OPEN_REF 0x00800000
+#define DISP_ENQ_CREATE_REF 0x01000000
+#define DISP_OPEN_LOCK 0x02000000
+#define DISP_OPEN_LEASE 0x04000000
+#define DISP_OPEN_STRIPE 0x08000000
+
+/* INODE LOCK PARTS */
+#define MDS_INODELOCK_LOOKUP 0x000001 /* For namespace, dentry etc, and also
+ * was used to protect permission (mode,
+ * owner, group etc) before 2.4. */
+#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */
+#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */
+
+/* The PERM bit was added in 2.4, and it is used to protect permissions (mode,
+ * owner, group, ACL, etc.), so as to separate the permission from the LOOKUP
+ * lock, because for remote directories (in DNE) these locks will be granted
+ * by different MDTs (different ldlm namespaces).
+ *
+ * For a local directory, the MDT will always grant UPDATE_LOCK|PERM_LOCK
+ * together. For a remote directory, the master MDT, where the remote
+ * directory is, will grant UPDATE_LOCK|PERM_LOCK, and the remote MDT, where
+ * the name entry is, will grant LOOKUP_LOCK. */
+#define MDS_INODELOCK_PERM 0x000010
+#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */
+
+#define MDS_INODELOCK_MAXSHIFT 5
+/* This FULL lock is useful to take on unlink sort of operations */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
+
+/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * name[2,3] fields that need to be used for the quota id (also a FID). */
+enum {
+ LUSTRE_RES_ID_SEQ_OFF = 0,
+ LUSTRE_RES_ID_VER_OID_OFF = 1,
+ LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+ LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+ LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+ LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+/* mdt_thread_info.mti_flags. */
+enum md_op_flags {
+ /* The flag indicates Size-on-MDS attributes are changed. */
+ MF_SOM_CHANGE = (1 << 0),
+ /* Flags indicating that an epoch opens or closes. */
+ MF_EPOCH_OPEN = (1 << 1),
+ MF_EPOCH_CLOSE = (1 << 2),
+ MF_MDC_CANCEL_FID1 = (1 << 3),
+ MF_MDC_CANCEL_FID2 = (1 << 4),
+ MF_MDC_CANCEL_FID3 = (1 << 5),
+ MF_MDC_CANCEL_FID4 = (1 << 6),
+ /* There is a pending attribute update. */
+ MF_SOM_AU = (1 << 7),
+ /* Cancel OST locks while getting OST attributes. */
+ MF_GETATTR_LOCK = (1 << 8),
+ MF_GET_MDT_IDX = (1 << 9),
+};
+
+#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE)
+
+#define LUSTRE_BFLAG_UNCOMMITTED_WRITES 0x1
+
+/* these should be identical to their EXT4_*_FL counterparts, they are
+ * redefined here only to avoid dragging in fs/ext4/ext4.h */
+#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */
+#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */
+#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */
+#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */
+
+/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values
+ * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire
+ * protocol equivalents of LDISKFS_*_FL values stored on disk, while
+ * the S_* flags are kernel-internal values that change between kernel
+ * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS.
+ * See b=16526 for a full history. */
+static inline int ll_ext_to_inode_flags(int flags)
+{
+ return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) |
+ ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) |
+ ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) |
+#if defined(S_DIRSYNC)
+ ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) |
+#endif
+ ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0));
+}
+
+static inline int ll_inode_to_ext_flags(int iflags)
+{
+ return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) |
+ ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) |
+ ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) |
+#if defined(S_DIRSYNC)
+ ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) |
+#endif
+ ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0));
+}
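+
+/*
+ * Illustrative sketch only: for the flag bits handled above, the two
+ * converters are inverses of each other (append-only + no-atime shown here).
+ */
+static inline int ll_flags_roundtrip_example(void)
+{
+	int flags = LUSTRE_APPEND_FL | LUSTRE_NOATIME_FL;
+
+	return ll_inode_to_ext_flags(ll_ext_to_inode_flags(flags)) == flags;
+}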
+
+/* 64 possible states */
+enum md_transient_state {
+ MS_RESTORE = (1 << 0), /* restore is running */
+};
+
+struct mdt_body {
+ struct lu_fid fid1;
+ struct lu_fid fid2;
+ struct lustre_handle handle;
+ __u64 valid;
+ __u64 size; /* Offset, in the case of MDS_READPAGE */
+ __s64 mtime;
+ __s64 atime;
+ __s64 ctime;
+ __u64 blocks; /* XID, in the case of MDS_READPAGE */
+ __u64 ioepoch;
+ __u64 t_state; /* transient file state defined in
+ * enum md_transient_state
+ * was "ino" until 2.4.0 */
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 capability;
+ __u32 mode;
+ __u32 uid;
+ __u32 gid;
+ __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */
+ __u32 rdev;
+ __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */
+ __u32 unused2; /* was "generation" until 2.4.0 */
+ __u32 suppgid;
+ __u32 eadatasize;
+ __u32 aclsize;
+ __u32 max_mdsize;
+ __u32 max_cookiesize;
+ __u32 uid_h; /* high 32-bits of uid, for FUID */
+ __u32 gid_h; /* high 32-bits of gid, for FUID */
+ __u32 padding_5; /* also fix lustre_swab_mdt_body */
+ __u64 padding_6;
+ __u64 padding_7;
+ __u64 padding_8;
+ __u64 padding_9;
+ __u64 padding_10;
+}; /* 216 */
+
+extern void lustre_swab_mdt_body (struct mdt_body *b);
+
+struct mdt_ioepoch {
+ struct lustre_handle handle;
+ __u64 ioepoch;
+ __u32 flags;
+ __u32 padding;
+};
+
+extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b);
+
+/* permissions for md_perm.mp_perm */
+enum {
+ CFS_SETUID_PERM = 0x01,
+ CFS_SETGID_PERM = 0x02,
+ CFS_SETGRP_PERM = 0x04,
+ CFS_RMTACL_PERM = 0x08,
+ CFS_RMTOWN_PERM = 0x10
+};
+
+/* inode access permission for a remote user; the inode info is omitted
+ * because the client already knows it. */
+struct mdt_remote_perm {
+ __u32 rp_uid;
+ __u32 rp_gid;
+ __u32 rp_fsuid;
+ __u32 rp_fsuid_h;
+ __u32 rp_fsgid;
+ __u32 rp_fsgid_h;
+ __u32 rp_access_perm; /* MAY_READ/WRITE/EXEC */
+ __u32 rp_padding;
+};
+
+extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p);
+
+struct mdt_rec_setattr {
+ __u32 sa_opcode;
+ __u32 sa_cap;
+ __u32 sa_fsuid;
+ __u32 sa_fsuid_h;
+ __u32 sa_fsgid;
+ __u32 sa_fsgid_h;
+ __u32 sa_suppgid;
+ __u32 sa_suppgid_h;
+ __u32 sa_padding_1;
+ __u32 sa_padding_1_h;
+ struct lu_fid sa_fid;
+ __u64 sa_valid;
+ __u32 sa_uid;
+ __u32 sa_gid;
+ __u64 sa_size;
+ __u64 sa_blocks;
+ __s64 sa_mtime;
+ __s64 sa_atime;
+ __s64 sa_ctime;
+ __u32 sa_attr_flags;
+ __u32 sa_mode;
+ __u32 sa_bias; /* some operation flags */
+ __u32 sa_padding_3;
+ __u32 sa_padding_4;
+ __u32 sa_padding_5;
+};
+
+extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
+
+/*
+ * Attribute flags used in mdt_rec_setattr::sa_valid.
+ * The kernel's #defines for ATTR_* should not be used over the network
+ * since the client and MDS may run different kernels (see bug 13828)
+ * Therefore, we should only use MDS_ATTR_* attributes for sa_valid.
+ */
+#define MDS_ATTR_MODE 0x1ULL /* = 1 */
+#define MDS_ATTR_UID 0x2ULL /* = 2 */
+#define MDS_ATTR_GID 0x4ULL /* = 4 */
+#define MDS_ATTR_SIZE 0x8ULL /* = 8 */
+#define MDS_ATTR_ATIME 0x10ULL /* = 16 */
+#define MDS_ATTR_MTIME 0x20ULL /* = 32 */
+#define MDS_ATTR_CTIME 0x40ULL /* = 64 */
+#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */
+#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */
+#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but force the change */
+#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */
+#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */
+#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */
+#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */
+#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */
+#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */
+
+#ifndef FMODE_READ
+#define FMODE_READ 00000001
+#define FMODE_WRITE 00000002
+#endif
+
+#define MDS_FMODE_CLOSED 00000000
+#define MDS_FMODE_EXEC 00000004
+/* IO Epoch is opened on a closed file. */
+#define MDS_FMODE_EPOCH 01000000
+/* IO Epoch is opened on a file truncate. */
+#define MDS_FMODE_TRUNC 02000000
+/* Size-on-MDS Attribute Update is pending. */
+#define MDS_FMODE_SOM 04000000
+
+#define MDS_OPEN_CREATED 00000010
+#define MDS_OPEN_CROSS 00000020
+
+#define MDS_OPEN_CREAT 00000100
+#define MDS_OPEN_EXCL 00000200
+#define MDS_OPEN_TRUNC 00001000
+#define MDS_OPEN_APPEND 00002000
+#define MDS_OPEN_SYNC 00010000
+#define MDS_OPEN_DIRECTORY 00200000
+
+#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file.
+ * We do not support JOIN FILE
+ * anymore; reserve this flag
+ * just to prevent the bit
+ * from being reused. */
+
+#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */
+#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA, the objs exist */
+#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or
+ * hsm restore) */
+#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created
+ unlinked */
+#define MDS_OPEN_LEASE 01000000000000ULL /* Open the file and grant lease
+ * delegation, succeed if it's not
+ * being opened with conflict mode.
+ */
+#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */
+
+/* permission to create a non-directory file */
+#define MAY_CREATE (1 << 7)
+/* permission to create a directory file */
+#define MAY_LINK (1 << 8)
+/* permission to delete from the directory */
+#define MAY_UNLINK (1 << 9)
+/* source's permission for rename */
+#define MAY_RENAME_SRC (1 << 10)
+/* target's permission for rename */
+#define MAY_RENAME_TAR (1 << 11)
+/* part (parent's) VTX permission check */
+#define MAY_VTX_PART (1 << 12)
+/* full VTX permission check */
+#define MAY_VTX_FULL (1 << 13)
+/* lfs rgetfacl permission check */
+#define MAY_RGETFACL (1 << 14)
+
+enum mds_op_bias {
+ MDS_CHECK_SPLIT = 1 << 0,
+ MDS_CROSS_REF = 1 << 1,
+ MDS_VTX_BYPASS = 1 << 2,
+ MDS_PERM_BYPASS = 1 << 3,
+ MDS_SOM = 1 << 4,
+ MDS_QUOTA_IGNORE = 1 << 5,
+ MDS_CLOSE_CLEANUP = 1 << 6,
+ MDS_KEEP_ORPHAN = 1 << 7,
+ MDS_RECOV_OPEN = 1 << 8,
+ MDS_DATA_MODIFIED = 1 << 9,
+ MDS_CREATE_VOLATILE = 1 << 10,
+ MDS_OWNEROVERRIDE = 1 << 11,
+ MDS_HSM_RELEASE = 1 << 12,
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_create {
+ __u32 cr_opcode;
+ __u32 cr_cap;
+ __u32 cr_fsuid;
+ __u32 cr_fsuid_h;
+ __u32 cr_fsgid;
+ __u32 cr_fsgid_h;
+ __u32 cr_suppgid1;
+ __u32 cr_suppgid1_h;
+ __u32 cr_suppgid2;
+ __u32 cr_suppgid2_h;
+ struct lu_fid cr_fid1;
+ struct lu_fid cr_fid2;
+ struct lustre_handle cr_old_handle; /* handle in case of open replay */
+ __s64 cr_time;
+ __u64 cr_rdev;
+ __u64 cr_ioepoch;
+ __u64 cr_padding_1; /* rr_blocks */
+ __u32 cr_mode;
+ __u32 cr_bias;
+ /* the set/get_mrc_cr_flags() helpers are needed to access the
+ * 64-bit cr_flags [cr_flags_l, cr_flags_h]; this is done to
+ * extend the cr_flags size without breaking 1.8 compat */
+ __u32 cr_flags_l; /* for use with open, low 32 bits */
+ __u32 cr_flags_h; /* for use with open, high 32 bits */
+ __u32 cr_umask; /* umask for create */
+ __u32 cr_padding_4; /* rr_padding_4 */
+};
+
+static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+ mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll);
+ mrc->cr_flags_h = (__u32)(flags >> 32);
+}
+
+static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc)
+{
+ return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32));
+}
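+
+/*
+ * Illustrative sketch only: a 64-bit open flags value survives the
+ * cr_flags_l/cr_flags_h split, e.g. MDS_OPEN_LOCK | MDS_OPEN_VOLATILE
+ * (chosen only as an example combination).
+ */
+static inline int mrc_cr_flags_example(struct mdt_rec_create *mrc)
+{
+	__u64 flags = MDS_OPEN_LOCK | MDS_OPEN_VOLATILE;
+
+	set_mrc_cr_flags(mrc, flags);
+	return get_mrc_cr_flags(mrc) == flags;
+}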
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_link {
+ __u32 lk_opcode;
+ __u32 lk_cap;
+ __u32 lk_fsuid;
+ __u32 lk_fsuid_h;
+ __u32 lk_fsgid;
+ __u32 lk_fsgid_h;
+ __u32 lk_suppgid1;
+ __u32 lk_suppgid1_h;
+ __u32 lk_suppgid2;
+ __u32 lk_suppgid2_h;
+ struct lu_fid lk_fid1;
+ struct lu_fid lk_fid2;
+ __s64 lk_time;
+ __u64 lk_padding_1; /* rr_atime */
+ __u64 lk_padding_2; /* rr_ctime */
+ __u64 lk_padding_3; /* rr_size */
+ __u64 lk_padding_4; /* rr_blocks */
+ __u32 lk_bias;
+ __u32 lk_padding_5; /* rr_mode */
+ __u32 lk_padding_6; /* rr_flags */
+ __u32 lk_padding_7; /* rr_padding_2 */
+ __u32 lk_padding_8; /* rr_padding_3 */
+ __u32 lk_padding_9; /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_unlink {
+ __u32 ul_opcode;
+ __u32 ul_cap;
+ __u32 ul_fsuid;
+ __u32 ul_fsuid_h;
+ __u32 ul_fsgid;
+ __u32 ul_fsgid_h;
+ __u32 ul_suppgid1;
+ __u32 ul_suppgid1_h;
+ __u32 ul_suppgid2;
+ __u32 ul_suppgid2_h;
+ struct lu_fid ul_fid1;
+ struct lu_fid ul_fid2;
+ __s64 ul_time;
+ __u64 ul_padding_2; /* rr_atime */
+ __u64 ul_padding_3; /* rr_ctime */
+ __u64 ul_padding_4; /* rr_size */
+ __u64 ul_padding_5; /* rr_blocks */
+ __u32 ul_bias;
+ __u32 ul_mode;
+ __u32 ul_padding_6; /* rr_flags */
+ __u32 ul_padding_7; /* rr_padding_2 */
+ __u32 ul_padding_8; /* rr_padding_3 */
+ __u32 ul_padding_9; /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_rename {
+ __u32 rn_opcode;
+ __u32 rn_cap;
+ __u32 rn_fsuid;
+ __u32 rn_fsuid_h;
+ __u32 rn_fsgid;
+ __u32 rn_fsgid_h;
+ __u32 rn_suppgid1;
+ __u32 rn_suppgid1_h;
+ __u32 rn_suppgid2;
+ __u32 rn_suppgid2_h;
+ struct lu_fid rn_fid1;
+ struct lu_fid rn_fid2;
+ __s64 rn_time;
+ __u64 rn_padding_1; /* rr_atime */
+ __u64 rn_padding_2; /* rr_ctime */
+ __u64 rn_padding_3; /* rr_size */
+ __u64 rn_padding_4; /* rr_blocks */
+ __u32 rn_bias; /* some operation flags */
+ __u32 rn_mode; /* cross-ref rename has mode */
+ __u32 rn_padding_5; /* rr_flags */
+ __u32 rn_padding_6; /* rr_padding_2 */
+ __u32 rn_padding_7; /* rr_padding_3 */
+ __u32 rn_padding_8; /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_setxattr {
+ __u32 sx_opcode;
+ __u32 sx_cap;
+ __u32 sx_fsuid;
+ __u32 sx_fsuid_h;
+ __u32 sx_fsgid;
+ __u32 sx_fsgid_h;
+ __u32 sx_suppgid1;
+ __u32 sx_suppgid1_h;
+ __u32 sx_suppgid2;
+ __u32 sx_suppgid2_h;
+ struct lu_fid sx_fid;
+ __u64 sx_padding_1; /* These three are rr_fid2 */
+ __u32 sx_padding_2;
+ __u32 sx_padding_3;
+ __u64 sx_valid;
+ __s64 sx_time;
+ __u64 sx_padding_5; /* rr_ctime */
+ __u64 sx_padding_6; /* rr_size */
+ __u64 sx_padding_7; /* rr_blocks */
+ __u32 sx_size;
+ __u32 sx_flags;
+ __u32 sx_padding_8; /* rr_flags */
+ __u32 sx_padding_9; /* rr_padding_2 */
+ __u32 sx_padding_10; /* rr_padding_3 */
+ __u32 sx_padding_11; /* rr_padding_4 */
+};
+
+/*
+ * mdt_rec_reint is the template for all mdt_reint_xxx structures.
+ * Do NOT change the size of various members, otherwise the value
+ * will be broken in lustre_swab_mdt_rec_reint().
+ *
+ * If you add new members in other mdt_reint_xxx structures and need to use the
+ * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also.
+ */
+struct mdt_rec_reint {
+ __u32 rr_opcode;
+ __u32 rr_cap;
+ __u32 rr_fsuid;
+ __u32 rr_fsuid_h;
+ __u32 rr_fsgid;
+ __u32 rr_fsgid_h;
+ __u32 rr_suppgid1;
+ __u32 rr_suppgid1_h;
+ __u32 rr_suppgid2;
+ __u32 rr_suppgid2_h;
+ struct lu_fid rr_fid1;
+ struct lu_fid rr_fid2;
+ __s64 rr_mtime;
+ __s64 rr_atime;
+ __s64 rr_ctime;
+ __u64 rr_size;
+ __u64 rr_blocks;
+ __u32 rr_bias;
+ __u32 rr_mode;
+ __u32 rr_flags;
+ __u32 rr_flags_h;
+ __u32 rr_umask;
+ __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
+
+struct lmv_desc {
+ __u32 ld_tgt_count; /* how many MDS's */
+ __u32 ld_active_tgt_count; /* how many active */
+ __u32 ld_default_stripe_count; /* how many objects are used */
+ __u32 ld_pattern; /* default MEA_MAGIC_* */
+ __u64 ld_default_hash_size;
+ __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_qos_maxage; /* in seconds */
+ __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */
+ struct obd_uuid ld_uuid;
+};
+
+extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
+
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+ __u32 mea_magic;
+ __u32 mea_count;
+ __u32 mea_master;
+ __u32 mea_padding;
+ char mea_pool_name[LOV_MAXPOOLNAME];
+ struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR 0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS 0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b
+
+#define MAX_HASH_SIZE_32 0x7fffffffUL
+#define MAX_HASH_SIZE 0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL
+
+enum fld_rpc_opc {
+ FLD_QUERY = 900,
+ FLD_LAST_OPC,
+ FLD_FIRST_OPC = FLD_QUERY
+};
+
+enum seq_rpc_opc {
+ SEQ_QUERY = 700,
+ SEQ_LAST_OPC,
+ SEQ_FIRST_OPC = SEQ_QUERY
+};
+
+enum seq_op {
+ SEQ_ALLOC_SUPER = 0,
+ SEQ_ALLOC_META = 1
+};
+
+/*
+ * LOV data structures
+ */
+
+#define LOV_MAX_UUID_BUFFER_SIZE 8192
+/* The size of the buffer the lov/mdc reserves for the
+ * array of UUIDs returned by the MDS. With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */
+#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS)
+
+/* LOV settings descriptor (should only contain static info) */
+struct lov_desc {
+ __u32 ld_tgt_count; /* how many OBD's */
+ __u32 ld_active_tgt_count; /* how many active */
+ __u32 ld_default_stripe_count; /* how many objects are used */
+ __u32 ld_pattern; /* default PATTERN_RAID0 */
+ __u64 ld_default_stripe_size; /* in bytes */
+ __u64 ld_default_stripe_offset; /* in bytes */
+ __u32 ld_padding_0; /* unused */
+ __u32 ld_qos_maxage; /* in seconds */
+ __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */
+ __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */
+ struct obd_uuid ld_uuid;
+};
+
+#define ld_magic ld_active_tgt_count /* for swabbing from llogs */
+
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
+/*
+ * LDLM requests:
+ */
+/* opcodes -- MUST be distinct from OST/MDS opcodes */
+typedef enum {
+ LDLM_ENQUEUE = 101,
+ LDLM_CONVERT = 102,
+ LDLM_CANCEL = 103,
+ LDLM_BL_CALLBACK = 104,
+ LDLM_CP_CALLBACK = 105,
+ LDLM_GL_CALLBACK = 106,
+ LDLM_SET_INFO = 107,
+ LDLM_LAST_OPC
+} ldlm_cmd_t;
+#define LDLM_FIRST_OPC LDLM_ENQUEUE
+
+#define RES_NAME_SIZE 4
+struct ldlm_res_id {
+ __u64 name[RES_NAME_SIZE];
+};
+
+#define DLDLMRES "[%#llx:%#llx:%#llx].%llx"
+#define PLDLMRES(res) (res)->lr_name.name[0], (res)->lr_name.name[1], \
+ (res)->lr_name.name[2], (res)->lr_name.name[3]
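+
+/* Usage sketch, for illustration only: DLDLMRES/PLDLMRES are printing
+ * helpers for anything that embeds the resource name as lr_name, e.g.:
+ *
+ *	CDEBUG(D_DLMTRACE, "resource "DLDLMRES"\n", PLDLMRES(res));
+ */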
+
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
+static inline int ldlm_res_eq(const struct ldlm_res_id *res0,
+ const struct ldlm_res_id *res1)
+{
+ return !memcmp(res0, res1, sizeof(*res0));
+}
+
+/* lock types */
+typedef enum {
+ LCK_MINMODE = 0,
+ LCK_EX = 1,
+ LCK_PW = 2,
+ LCK_PR = 4,
+ LCK_CW = 8,
+ LCK_CR = 16,
+ LCK_NL = 32,
+ LCK_GROUP = 64,
+ LCK_COS = 128,
+ LCK_MAXMODE
+} ldlm_mode_t;
+
+#define LCK_MODE_NUM 8
+
+typedef enum {
+ LDLM_PLAIN = 10,
+ LDLM_EXTENT = 11,
+ LDLM_FLOCK = 12,
+ LDLM_IBITS = 13,
+ LDLM_MAX_TYPE
+} ldlm_type_t;
+
+#define LDLM_MIN_TYPE LDLM_PLAIN
+
+struct ldlm_extent {
+ __u64 start;
+ __u64 end;
+ __u64 gid;
+};
+
+static inline int ldlm_extent_overlap(struct ldlm_extent *ex1,
+ struct ldlm_extent *ex2)
+{
+ return (ex1->start <= ex2->end) && (ex2->start <= ex1->end);
+}
+
+/* check if @ex1 contains @ex2 */
+static inline int ldlm_extent_contain(struct ldlm_extent *ex1,
+ struct ldlm_extent *ex2)
+{
+ return (ex1->start <= ex2->start) && (ex1->end >= ex2->end);
+}
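+
+/* Worked example, for illustration only: the extents [0, 100] and [50, 200]
+ * overlap but neither contains the other, while [0, 200] contains both; the
+ * group id (gid) takes no part in either test. */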
+
+struct ldlm_inodebits {
+ __u64 bits;
+};
+
+struct ldlm_flock_wire {
+ __u64 lfw_start;
+ __u64 lfw_end;
+ __u64 lfw_owner;
+ __u32 lfw_padding;
+ __u32 lfw_pid;
+};
+
+/* it's important that the fields of the ldlm_extent structure match
+ * the first fields of the ldlm_flock structure because there is only
+ * one ldlm_swab routine to process the ldlm_policy_data_t union. if
+ * this ever changes we will need to swab the union differently based
+ * on the resource type. */
+
+typedef union {
+ struct ldlm_extent l_extent;
+ struct ldlm_flock_wire l_flock;
+ struct ldlm_inodebits l_inodebits;
+} ldlm_wire_policy_data_t;
+
+extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d);
+
+union ldlm_gl_desc {
+ struct ldlm_gl_lquota_desc lquota_desc;
+};
+
+extern void lustre_swab_gl_desc(union ldlm_gl_desc *);
+
+struct ldlm_intent {
+ __u64 opc;
+};
+
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
+struct ldlm_resource_desc {
+ ldlm_type_t lr_type;
+ __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */
+ struct ldlm_res_id lr_name;
+};
+
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
+struct ldlm_lock_desc {
+ struct ldlm_resource_desc l_resource;
+ ldlm_mode_t l_req_mode;
+ ldlm_mode_t l_granted_mode;
+ ldlm_wire_policy_data_t l_policy_data;
+};
+
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
+#define LDLM_LOCKREQ_HANDLES 2
+#define LDLM_ENQUEUE_CANCEL_OFF 1
+
+struct ldlm_request {
+ __u32 lock_flags;
+ __u32 lock_count;
+ struct ldlm_lock_desc lock_desc;
+ struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES];
+};
+
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
+/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available.
+ * Otherwise, 2 are available. */
+#define ldlm_request_bufsize(count, type) \
+({ \
+ int _avail = LDLM_LOCKREQ_HANDLES; \
+ _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \
+ sizeof(struct ldlm_request) + \
+ (count > _avail ? count - _avail : 0) * \
+ sizeof(struct lustre_handle); \
+})
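+
+/*
+ * Usage sketch, for illustration only (the _example helper is hypothetical):
+ * for an LDLM_ENQUEUE carrying three cancel handles, one handle fits in the
+ * single embedded slot left over after the enqueue itself, so the macro
+ * above appends room for two more after the structure.
+ */
+static inline int ldlm_request_bufsize_example(void)
+{
+ return ldlm_request_bufsize(3, LDLM_ENQUEUE) ==
+        sizeof(struct ldlm_request) + 2 * sizeof(struct lustre_handle);
+}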
+
+struct ldlm_reply {
+ __u32 lock_flags;
+ __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */
+ struct ldlm_lock_desc lock_desc;
+ struct lustre_handle lock_handle;
+ __u64 lock_policy_res1;
+ __u64 lock_policy_res2;
+};
+
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
+#define ldlm_flags_to_wire(flags) ((__u32)(flags))
+#define ldlm_flags_from_wire(flags) ((__u64)(flags))
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+ MGS_CONNECT = 250,
+ MGS_DISCONNECT,
+ MGS_EXCEPTION, /* node died, etc. */
+ MGS_TARGET_REG, /* whenever target starts up */
+ MGS_TARGET_DEL,
+ MGS_SET_INFO,
+ MGS_CONFIG_READ,
+ MGS_LAST_OPC
+} mgs_cmd_t;
+#define MGS_FIRST_OPC MGS_CONNECT
+
+#define MGS_PARAM_MAXLEN 1024
+#define KEY_SET_INFO "set_info"
+
+struct mgs_send_param {
+ char mgs_param[MGS_PARAM_MAXLEN];
+};
+
+/* We pass this info to the MGS so it can write config logs */
+#define MTI_NAME_MAXLEN 64
+#define MTI_PARAM_MAXLEN 4096
+#define MTI_NIDS_MAX 32
+struct mgs_target_info {
+ __u32 mti_lustre_ver;
+ __u32 mti_stripe_index;
+ __u32 mti_config_ver;
+ __u32 mti_flags;
+ __u32 mti_nid_count;
+ __u32 mti_instance; /* Running instance of target */
+ char mti_fsname[MTI_NAME_MAXLEN];
+ char mti_svname[MTI_NAME_MAXLEN];
+ char mti_uuid[sizeof(struct obd_uuid)];
+ __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/
+ char mti_params[MTI_PARAM_MAXLEN];
+};
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+struct mgs_nidtbl_entry {
+ __u64 mne_version; /* table version of this entry */
+ __u32 mne_instance; /* target instance # */
+ __u32 mne_index; /* target index */
+ __u32 mne_length; /* length of this entry, in bytes */
+ __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */
+ __u8 mne_nid_type; /* type of NID (must be zero); reserved for IPv6 */
+ __u8 mne_nid_size; /* size of each NID, in bytes */
+ __u8 mne_nid_count; /* # of NIDs in buffer */
+ union {
+ lnet_nid_t nids[0]; /* variable size buffer for NIDs. */
+ } u;
+};
+extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo);
+
+struct mgs_config_body {
+ char mcb_name[MTI_NAME_MAXLEN]; /* logname */
+ __u64 mcb_offset; /* next index of config log to request */
+ __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+ __u8 mcb_reserved;
+ __u8 mcb_bits; /* bits unit size of config log */
+ __u32 mcb_units; /* # of units for bulk transfer */
+};
+extern void lustre_swab_mgs_config_body(struct mgs_config_body *body);
+
+struct mgs_config_res {
+ __u64 mcr_offset; /* index of last config log */
+ __u64 mcr_size; /* size of the log */
+};
+extern void lustre_swab_mgs_config_res(struct mgs_config_res *body);
+
+/* Config marker flags (in config log) */
+#define CM_START 0x01
+#define CM_END 0x02
+#define CM_SKIP 0x04
+#define CM_UPGRADE146 0x08
+#define CM_EXCLUDE 0x10
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+ __u32 cm_step; /* aka config version */
+ __u32 cm_flags;
+ __u32 cm_vers; /* lustre release version number */
+ __u32 cm_padding; /* 64 bit align */
+ __s64 cm_createtime; /*when this record was first created */
+ __s64 cm_canceltime; /*when this record is no longer valid*/
+ char cm_tgtname[MTI_NAME_MAXLEN];
+ char cm_comment[MTI_NAME_MAXLEN];
+};
+
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+ int swab, int size);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+typedef enum {
+ OBD_PING = 400,
+ OBD_LOG_CANCEL,
+ OBD_QC_CALLBACK,
+ OBD_IDX_READ,
+ OBD_LAST_OPC
+} obd_cmd_t;
+#define OBD_FIRST_OPC OBD_PING
+
+/* catalog of log objects */
+
+/** Identifier for a single log object */
+struct llog_logid {
+ struct ost_id lgl_oi;
+ __u32 lgl_ogen;
+} __attribute__((packed));
+
+/** Records written to the CATALOGS list */
+#define CATLIST "CATALOGS"
+struct llog_catid {
+ struct llog_logid lci_logid;
+ __u32 lci_padding1;
+ __u32 lci_padding2;
+ __u32 lci_padding3;
+} __attribute__((packed));
+
+/* Log data record types - there is no specific reason that these need to
+ * be related to the RPC opcodes, but no reason not to (may be handy later?)
+ */
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK 0xfff00000
+
+typedef enum {
+ LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000,
+ OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00,
+ /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */
+ MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) |
+ REINT_UNLINK, /* obsolete after 2.5.0 */
+ MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+ REINT_UNLINK,
+ /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */
+ MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+ REINT_SETATTR,
+ OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000,
+ /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */
+ LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000,
+ /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */
+ CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000,
+ CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000,
+ HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000,
+ LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539,
+ LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b,
+} llog_op_type;
+
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+ (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC))
+
+/** Log record header - stored in little endian order.
+ * Each record must start with this struct, end with a llog_rec_tail,
+ * and be a multiple of 256 bits in size.
+ */
+struct llog_rec_hdr {
+ __u32 lrh_len;
+ __u32 lrh_index;
+ __u32 lrh_type;
+ __u32 lrh_id;
+};
+
+struct llog_rec_tail {
+ __u32 lrt_len;
+ __u32 lrt_index;
+};
+
+/* Where the data follows, just after the header */
+#define REC_DATA(ptr) \
+ ((void *)((char *)ptr + sizeof(struct llog_rec_hdr)))
+
+#define REC_DATA_LEN(rec) \
+ (rec->lrh_len - sizeof(struct llog_rec_hdr) - \
+ sizeof(struct llog_rec_tail))
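+
+/*
+ * Usage sketch, for illustration only (the _example helper is hypothetical):
+ * the two macros above locate a record's payload, which sits between the
+ * llog_rec_hdr and the llog_rec_tail.
+ */
+static inline void *llog_rec_payload_example(struct llog_rec_hdr *rec,
+                                             __u32 *payload_len)
+{
+ *payload_len = REC_DATA_LEN(rec); /* lrh_len minus header and tail */
+ return REC_DATA(rec);             /* first byte after the header */
+}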
+
+struct llog_logid_rec {
+ struct llog_rec_hdr lid_hdr;
+ struct llog_logid lid_id;
+ __u32 lid_padding1;
+ __u64 lid_padding2;
+ __u64 lid_padding3;
+ struct llog_rec_tail lid_tail;
+} __attribute__((packed));
+
+struct llog_unlink_rec {
+ struct llog_rec_hdr lur_hdr;
+ __u64 lur_oid;
+ __u32 lur_oseq;
+ __u32 lur_count;
+ struct llog_rec_tail lur_tail;
+} __attribute__((packed));
+
+struct llog_unlink64_rec {
+ struct llog_rec_hdr lur_hdr;
+ struct lu_fid lur_fid;
+ __u32 lur_count; /* to destroy the lost precreated */
+ __u32 lur_padding1;
+ __u64 lur_padding2;
+ __u64 lur_padding3;
+ struct llog_rec_tail lur_tail;
+} __attribute__((packed));
+
+struct llog_setattr64_rec {
+ struct llog_rec_hdr lsr_hdr;
+ struct ost_id lsr_oi;
+ __u32 lsr_uid;
+ __u32 lsr_uid_h;
+ __u32 lsr_gid;
+ __u32 lsr_gid_h;
+ __u64 lsr_padding;
+ struct llog_rec_tail lsr_tail;
+} __attribute__((packed));
+
+struct llog_size_change_rec {
+ struct llog_rec_hdr lsc_hdr;
+ struct ll_fid lsc_fid;
+ __u32 lsc_ioepoch;
+ __u32 lsc_padding1;
+ __u64 lsc_padding2;
+ __u64 lsc_padding3;
+ struct llog_rec_tail lsc_tail;
+} __attribute__((packed));
+
+#define CHANGELOG_MAGIC 0xca103000
+
+/** \a changelog_rec_type's that can't be masked */
+#define CHANGELOG_MINMASK (1 << CL_MARK)
+/** bits covering all \a changelog_rec_type's */
+#define CHANGELOG_ALLMASK 0XFFFFFFFF
+/** default \a changelog_rec_type mask */
+#define CHANGELOG_DEFMASK CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE)
+
+/* changelog llog name, needed by client replicators */
+#define CHANGELOG_CATALOG "changelog_catalog"
+
+struct changelog_setinfo {
+ __u64 cs_recno;
+ __u32 cs_id;
+} __attribute__((packed));
+
+/** changelog record */
+struct llog_changelog_rec {
+ struct llog_rec_hdr cr_hdr;
+ struct changelog_rec cr;
+ struct llog_rec_tail cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+struct llog_changelog_ext_rec {
+ struct llog_rec_hdr cr_hdr;
+ struct changelog_ext_rec cr;
+ struct llog_rec_tail cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+#define CHANGELOG_USER_PREFIX "cl"
+
+struct llog_changelog_user_rec {
+ struct llog_rec_hdr cur_hdr;
+ __u32 cur_id;
+ __u32 cur_padding;
+ __u64 cur_endrec;
+ struct llog_rec_tail cur_tail;
+} __attribute__((packed));
+
+enum agent_req_status {
+ ARS_WAITING,
+ ARS_STARTED,
+ ARS_FAILED,
+ ARS_CANCELED,
+ ARS_SUCCEED,
+};
+
+static inline char *agent_req_status2name(enum agent_req_status ars)
+{
+ switch (ars) {
+ case ARS_WAITING:
+ return "WAITING";
+ case ARS_STARTED:
+ return "STARTED";
+ case ARS_FAILED:
+ return "FAILED";
+ case ARS_CANCELED:
+ return "CANCELED";
+ case ARS_SUCCEED:
+ return "SUCCEED";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static inline bool agent_req_in_final_state(enum agent_req_status ars)
+{
+ return ((ars == ARS_SUCCEED) || (ars == ARS_FAILED) ||
+ (ars == ARS_CANCELED));
+}
+
+struct llog_agent_req_rec {
+ struct llog_rec_hdr arr_hdr; /**< record header */
+ __u32 arr_status; /**< status of the request */
+ /* must match enum
+ * agent_req_status */
+ __u32 arr_archive_id; /**< backend archive number */
+ __u64 arr_flags; /**< req flags */
+ __u64 arr_compound_id; /**< compound cookie */
+ __u64 arr_req_create; /**< req. creation time */
+ __u64 arr_req_change; /**< req. status change time */
+ struct hsm_action_item arr_hai; /**< req. to the agent */
+ struct llog_rec_tail arr_tail; /**< record tail, for_sizeof_only */
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+ __u64 mnt_cnt;
+ __u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+ struct llog_rec_hdr lgr_hdr;
+ struct llog_gen lgr_gen;
+ __u64 padding1;
+ __u64 padding2;
+ __u64 padding3;
+ struct llog_rec_tail lgr_tail;
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_CHUNK_SIZE 8192
+#define LLOG_HEADER_SIZE (96)
+#define LLOG_BITMAP_BYTES (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE)
+
+#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+/* flags for the logs */
+enum llog_flag {
+ LLOG_F_ZAP_WHEN_EMPTY = 0x1,
+ LLOG_F_IS_CAT = 0x2,
+ LLOG_F_IS_PLAIN = 0x4,
+};
+
+struct llog_log_hdr {
+ struct llog_rec_hdr llh_hdr;
+ __s64 llh_timestamp;
+ __u32 llh_count;
+ __u32 llh_bitmap_offset;
+ __u32 llh_size;
+ __u32 llh_flags;
+ __u32 llh_cat_idx;
+ /* for a catalog the first plain slot is next to it */
+ struct obd_uuid llh_tgtuuid;
+ __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23];
+ __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)];
+ struct llog_rec_tail llh_tail;
+} __attribute__((packed));
+
+#define LLOG_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \
+ llh->llh_bitmap_offset - \
+ sizeof(llh->llh_tail)) * 8)
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+ struct llog_logid lgc_lgl;
+ __u32 lgc_subsys;
+ __u32 lgc_index;
+ __u32 lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+ LLOG_ORIGIN_HANDLE_CREATE = 501,
+ LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502,
+ LLOG_ORIGIN_HANDLE_READ_HEADER = 503,
+ LLOG_ORIGIN_HANDLE_WRITE_REC = 504,
+ LLOG_ORIGIN_HANDLE_CLOSE = 505,
+ LLOG_ORIGIN_CONNECT = 506,
+ LLOG_CATINFO = 507, /* deprecated */
+ LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508,
+ LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/
+ LLOG_LAST_OPC,
+ LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+ struct llog_logid lgd_logid;
+ __u32 lgd_ctxt_idx;
+ __u32 lgd_llh_flags;
+ __u32 lgd_index;
+ __u32 lgd_saved_index;
+ __u32 lgd_len;
+ __u64 lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+ struct llog_gen lgdc_gen;
+ struct llog_logid lgdc_logid;
+ __u32 lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+ __u64 o_valid; /* hot fields in this obdo */
+ struct ost_id o_oi;
+ __u64 o_parent_seq;
+ __u64 o_size; /* o_size-o_blocks == ost_lvb */
+ __s64 o_mtime;
+ __s64 o_atime;
+ __s64 o_ctime;
+ __u64 o_blocks; /* brw: cli sent cached bytes */
+ __u64 o_grant;
+
+ /* 32-bit fields start here: keep an even number of them via padding */
+ __u32 o_blksize; /* optimal IO blocksize */
+ __u32 o_mode; /* brw: cli sent cache remain */
+ __u32 o_uid;
+ __u32 o_gid;
+ __u32 o_flags;
+ __u32 o_nlink; /* brw: checksum */
+ __u32 o_parent_oid;
+ __u32 o_misc; /* brw: o_dropped */
+
+ __u64 o_ioepoch; /* epoch in ost writes */
+ __u32 o_stripe_idx; /* holds stripe idx */
+ __u32 o_parent_ver;
+ struct lustre_handle o_handle; /* brw: lock handle to prolong
+ * locks */
+ struct llog_cookie o_lcookie; /* destroy: unlink cookie from
+ * MDS */
+ __u32 o_uid_h;
+ __u32 o_gid_h;
+
+ __u64 o_data_version; /* getattr: sum of iversion for
+ * each stripe.
+ * brw: grant space consumed on
+ * the client for the write */
+ __u64 o_padding_4;
+ __u64 o_padding_5;
+ __u64 o_padding_6;
+};
+
+#define o_dirty o_blocks
+#define o_undirty o_mode
+#define o_dropped o_misc
+#define o_cksum o_nlink
+#define o_grant_used o_data_version
+
+static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd,
+ struct obdo *wobdo,
+ const struct obdo *lobdo)
+{
+ *wobdo = *lobdo;
+ wobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+ if (ocd == NULL)
+ return;
+
+ if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+ fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) {
+ /* Currently OBD_FL_OSTID is only used when a 2.4 echo
+ * client communicates with a pre-2.4 server */
+ wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid);
+ wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid);
+ }
+}
+
+static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd,
+ struct obdo *lobdo,
+ const struct obdo *wobdo)
+{
+ __u32 local_flags = 0;
+
+ if (lobdo->o_valid & OBD_MD_FLFLAGS)
+ local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+
+ *lobdo = *wobdo;
+ if (local_flags != 0) {
+ lobdo->o_valid |= OBD_MD_FLFLAGS;
+ lobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+ lobdo->o_flags |= local_flags;
+ }
+ if (ocd == NULL)
+ return;
+
+ if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+ fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) {
+ /* see above */
+ lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq;
+ lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id;
+ lobdo->o_oi.oi_fid.f_ver = 0;
+ }
+}
+
+extern void lustre_swab_obdo (struct obdo *o);
+
+/* request structure for OST's */
+struct ost_body {
+ struct obdo oa;
+};
+
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+ char name[8];
+ struct obdo oa;
+ struct ll_user_fiemap fiemap;
+};
+
+extern void lustre_swab_ost_body (struct ost_body *b);
+extern void lustre_swab_ost_last_id(__u64 *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
+
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+ int stripe_count);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
+
+/* llog_swab.c */
+extern void lustre_swab_llogd_body (struct llogd_body *d);
+extern void lustre_swab_llog_hdr (struct llog_log_hdr *h);
+extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+extern void lustre_swab_llog_id(struct llog_logid *lid);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+
+/* Functions for dumping PTLRPC fields */
+void dump_rniobuf(struct niobuf_remote *rnb);
+void dump_ioo(struct obd_ioobj *nb);
+void dump_obdo(struct obdo *oa);
+void dump_ost_body(struct ost_body *ob);
+void dump_rcs(__u32 *rc);
+
+#define IDX_INFO_MAGIC 0x3D37CC37
+
+/* Index file transfer through the network. The server serializes the index into
+ * a byte stream which is sent to the client via a bulk transfer */
+struct idx_info {
+ __u32 ii_magic;
+
+ /* reply: see idx_info_flags below */
+ __u32 ii_flags;
+
+ /* request & reply: number of lu_idxpage (to be) transferred */
+ __u16 ii_count;
+ __u16 ii_pad0;
+
+ /* request: requested attributes passed down to the iterator API */
+ __u32 ii_attrs;
+
+ /* request & reply: index file identifier (FID) */
+ struct lu_fid ii_fid;
+
+ /* reply: version of the index file before starting to walk the index.
+ * Please note that the version can be modified at any time during the
+ * transfer */
+ __u64 ii_version;
+
+ /* request: hash to start with:
+ * reply: hash of the first entry of the first lu_idxpage and hash
+ * of the entry to read next if any */
+ __u64 ii_hash_start;
+ __u64 ii_hash_end;
+
+ /* reply: size of keys in lu_idxpages, or the minimal key size if
+ * II_FL_VARKEY is set */
+ __u16 ii_keysize;
+
+ /* reply: size of records in lu_idxpages, or the minimal record size
+ * if II_FL_VARREC is set */
+ __u16 ii_recsize;
+
+ __u32 ii_pad1;
+ __u64 ii_pad2;
+ __u64 ii_pad3;
+};
+extern void lustre_swab_idx_info(struct idx_info *ii);
+
+#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */
+
+/* List of flags used in idx_info::ii_flags */
+enum idx_info_flags {
+ II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */
+ II_FL_VARKEY = 1 << 1, /* keys can be of variable size */
+ II_FL_VARREC = 1 << 2, /* records can be of variable size */
+ II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */
+};
+
+#define LIP_MAGIC 0x8A6D6B6C
+
+/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */
+struct lu_idxpage {
+ /* 16-byte header */
+ __u32 lip_magic;
+ __u16 lip_flags;
+ __u16 lip_nr; /* number of entries in the container */
+ __u64 lip_pad0; /* additional padding for future use */
+
+ /* key/record pairs are stored in the remaining 4080 bytes.
+ * depending upon the flags in idx_info::ii_flags, each key/record
+ * pair might be preceded by:
+ * - a hash value
+ * - the key size (II_FL_VARKEY is set)
+ * - the record size (II_FL_VARREC is set)
+ *
+ * For the time being, we only support fixed-size key & record. */
+ char lip_entries[0];
+};
+extern void lustre_swab_lip_header(struct lu_idxpage *lip);
+
+#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries))
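+
+/*
+ * Usage sketch, for illustration only (the _example helper is hypothetical):
+ * with fixed-size keys and records (the only mode supported for now, per the
+ * comment above) and no per-entry hash prefix, the capacity of a container
+ * follows directly from the header size.
+ */
+static inline __u16 lu_idxpage_capacity_example(__u16 keysize, __u16 recsize)
+{
+ return (LU_PAGE_SIZE - LIP_HDR_SIZE) / (keysize + recsize);
+}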
+
+/* Gathers all possible types associated with a 4KB container */
+union lu_page {
+ struct lu_dirpage lp_dir; /* for MDS_READPAGE */
+ struct lu_idxpage lp_idx; /* for OBD_IDX_READ */
+ char lp_array[LU_PAGE_SIZE];
+};
+
+/* security opcodes */
+typedef enum {
+ SEC_CTX_INIT = 801,
+ SEC_CTX_INIT_CONT = 802,
+ SEC_CTX_FINI = 803,
+ SEC_LAST_OPC,
+ SEC_FIRST_OPC = SEC_CTX_INIT
+} sec_cmd_t;
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN 64
+#define CAPA_HMAC_KEY_MAX_LEN 56
+
+/* NB take care when changing the sequence of elements in this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+ struct lu_fid lc_fid; /** fid */
+ __u64 lc_opc; /** operations allowed */
+ __u64 lc_uid; /** file owner */
+ __u64 lc_gid; /** file group */
+ __u32 lc_flags; /** HMAC algorithm & flags */
+ __u32 lc_keyid; /** key# used for the capability */
+ __u32 lc_timeout; /** capa timeout value (sec) */
+ __u32 lc_expiry; /** expiry time (sec) */
+ __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa(struct lustre_capa *c);
+
+/** lustre_capa::lc_opc */
+enum {
+ CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */
+ CAPA_OPC_BODY_READ = 1<<1, /**< read object data */
+ CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */
+ CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */
+ CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */
+ CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */
+ CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */
+ CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */
+ CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */
+ CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */
+ CAPA_OPC_META_READ = 1<<10, /**< read object meta data */
+};
+
+#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE)
+#define CAPA_OPC_MDS_ONLY \
+ (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \
+ CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE)
+#define CAPA_OPC_OSS_ONLY \
+ (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \
+ CAPA_OPC_OSS_DESTROY)
+#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY
+#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY)
+
+/* The MDS capability covers object capability for operations of body r/w
+ * (dir readpage/sendpage), index lookup/insert/delete and metadata r/w,
+ * while the OSS capability only covers object capability for operations on
+ * OSS data (file content): r/w/truncate.
+ */
+static inline int capa_for_mds(struct lustre_capa *c)
+{
+ return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0;
+}
+
+static inline int capa_for_oss(struct lustre_capa *c)
+{
+ return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0;
+}
+
+/* lustre_capa::lc_hmac_alg */
+enum {
+ CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */
+ CAPA_HMAC_ALG_MAX,
+};
+
+#define CAPA_FL_MASK 0x00ffffff
+#define CAPA_HMAC_ALG_MASK 0xff000000
+
+struct lustre_capa_key {
+ __u64 lk_seq; /**< mds# */
+ __u32 lk_keyid; /**< key# */
+ __u32 lk_padding;
+ __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
+
+/** The link ea holds 1 \a link_ea_entry for each hardlink */
+#define LINK_EA_MAGIC 0x11EAF1DFUL
+struct link_ea_header {
+ __u32 leh_magic;
+ __u32 leh_reccount;
+ __u64 leh_len; /* total size */
+ /* future use */
+ __u32 padding1;
+ __u32 padding2;
+};
+
+/** Hardlink data is name and parent fid.
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+ /** __u16 stored big-endian, unaligned */
+ unsigned char lee_reclen[2];
+ unsigned char lee_parent_fid[sizeof(struct lu_fid)];
+ char lee_name[0];
+}__attribute__((packed));
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+ struct lu_fid gf_fid;
+ __u64 gf_recno;
+ __u32 gf_linkno;
+ __u32 gf_pathlen;
+ char gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path (struct getinfo_fid2path *gf);
+
+enum {
+ LAYOUT_INTENT_ACCESS = 0,
+ LAYOUT_INTENT_READ = 1,
+ LAYOUT_INTENT_WRITE = 2,
+ LAYOUT_INTENT_GLIMPSE = 3,
+ LAYOUT_INTENT_TRUNC = 4,
+ LAYOUT_INTENT_RELEASE = 5,
+ LAYOUT_INTENT_RESTORE = 6
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+ __u32 li_opc; /* intent operation for enqueue, read, write etc */
+ __u32 li_flags;
+ __u64 li_start;
+ __u64 li_end;
+};
+
+void lustre_swab_layout_intent(struct layout_intent *li);
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+ /* Field taken from struct hsm_progress */
+ lustre_fid hpk_fid;
+ __u64 hpk_cookie;
+ struct hsm_extent hpk_extent;
+ __u16 hpk_flags;
+ __u16 hpk_errval; /* positive val */
+ __u32 hpk_padding1;
+ /* Additional fields */
+ __u64 hpk_data_version;
+ __u64 hpk_padding2;
+} __attribute__((packed));
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_current_action(struct hsm_current_action *action);
+extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk);
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui);
+extern void lustre_swab_hsm_request(struct hsm_request *hr);
+
+/**
+ * These are the object update opcodes under UPDATE_OBJ, which are currently
+ * used by cross-ref operations between MDTs.
+ *
+ * During the cross-ref operation, the master MDT, to which the client sends
+ * the request, disassembles the operation into object updates; the OSP then
+ * sends these updates to the remote MDT to be executed.
+ *
+ * Update request format
+ * magic: UPDATE_BUFFER_MAGIC_V1
+ * Count: How many updates in the req.
+ * bufs[0]: the packed object updates follow.
+ * update[0]:
+ * type: object_update_op, the op code of update
+ * fid: The object fid of the update.
+ * lens/bufs: other parameters of the update.
+ * update[1]:
+ * type: object_update_op, the op code of update
+ * fid: The object fid of the update.
+ * lens/bufs: other parameters of the update.
+ * ..........
+ * update[7]: type: object_update_op, the op code of update
+ * fid: The object fid of the update.
+ * lens/bufs: other parameters of the update.
+ * Currently at most 8 updates are allowed per object update request.
+ *
+ *******************************************************************
+ * update reply format:
+ *
+ * ur_version: UPDATE_REPLY_V1
+ * ur_count: The count of the reply, which is usually equal
+ * to the number of updates in the request.
+ * ur_lens: The reply lengths of each object update.
+ *
+ * replies: 1st update reply [4bytes_ret: other body]
+ * 2nd update reply [4bytes_ret: other body]
+ * .....
+ * nth update reply [4bytes_ret: other body]
+ *
+ * For each update, the reply format is:
+ * result (4 bytes) : other data
+ */
+
+#define UPDATE_MAX_OPS 10
+#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001
+#define UPDATE_BUFFER_MAGIC UPDATE_BUFFER_MAGIC_V1
+#define UPDATE_BUF_COUNT 8
+enum object_update_op {
+ OBJ_CREATE = 1,
+ OBJ_DESTROY = 2,
+ OBJ_REF_ADD = 3,
+ OBJ_REF_DEL = 4,
+ OBJ_ATTR_SET = 5,
+ OBJ_ATTR_GET = 6,
+ OBJ_XATTR_SET = 7,
+ OBJ_XATTR_GET = 8,
+ OBJ_INDEX_LOOKUP = 9,
+ OBJ_INDEX_INSERT = 10,
+ OBJ_INDEX_DELETE = 11,
+ OBJ_LAST
+};
+
+struct update {
+ __u32 u_type;
+ __u32 u_batchid;
+ struct lu_fid u_fid;
+ __u32 u_lens[UPDATE_BUF_COUNT];
+ __u32 u_bufs[0];
+};
+
+struct update_buf {
+ __u32 ub_magic;
+ __u32 ub_count;
+ __u32 ub_bufs[0];
+};
+
+#define UPDATE_REPLY_V1 0x00BD0001
+struct update_reply {
+ __u32 ur_version;
+ __u32 ur_count;
+ __u32 ur_lens[0];
+};
+
+void lustre_swab_update_buf(struct update_buf *ub);
+void lustre_swab_update_reply_buf(struct update_reply *ur);
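+
+/*
+ * Usage sketch, for illustration only (the _example helper is hypothetical):
+ * initializing the fixed header of an update request buffer as described in
+ * the format comment above.  How the individual struct update entries are
+ * packed after ub_bufs[] is left to the caller.
+ */
+static inline void update_buf_init_example(struct update_buf *ub, __u32 count)
+{
+ ub->ub_magic = UPDATE_BUFFER_MAGIC;
+ ub->ub_count = count; /* number of updates that follow */
+}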
+
+/** layout swap request structure
+ * fid1 and fid2 are in mdt_body
+ */
+struct mdc_swap_layouts {
+ __u64 msl_flags;
+} __packed;
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
+
+struct close_data {
+ struct lustre_handle cd_handle;
+ struct lu_fid cd_fid;
+ __u64 cd_data_version;
+ __u64 cd_reserved[8];
+};
+
+void lustre_swab_close_data(struct close_data *data);
+
+#endif
+/** @} lustreidl */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644
index 000000000..1c87a61a7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+enum lfsck_param_flags {
+ /* Reset LFSCK iterator position to the device beginning. */
+ LPF_RESET = 0x0001,
+
+ /* Exit when fail. */
+ LPF_FAILOUT = 0x0002,
+
+ /* Dryrun mode, only check without modification */
+ LPF_DRYRUN = 0x0004,
+};
+
+enum lfsck_type {
+ /* For MDT-OST consistency check/repair. */
+ LT_LAYOUT = 0x0001,
+
+ /* For MDT-MDT consistency check/repair. */
+ LT_DNE = 0x0002,
+
+ /* For FID-in-dirent and linkEA consistency check/repair. */
+ LT_NAMESPACE = 0x0004,
+};
+
+#define LFSCK_VERSION_V1 1
+#define LFSCK_VERSION_V2 2
+
+#define LFSCK_TYPES_ALL ((__u16)(~0))
+#define LFSCK_TYPES_DEF ((__u16)0)
+#define LFSCK_TYPES_SUPPORTED LT_NAMESPACE
+
+#define LFSCK_SPEED_NO_LIMIT 0
+#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT
+
+enum lfsck_start_valid {
+ LSV_SPEED_LIMIT = 0x00000001,
+ LSV_ERROR_HANDLE = 0x00000002,
+ LSV_DRYRUN = 0x00000004,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+ /* Which arguments are valid, see 'enum lfsck_start_valid'. */
+ __u32 ls_valid;
+
+ /* How many items can be scanned at most per second. */
+ __u32 ls_speed_limit;
+
+ /* For compatibility between user space tools and kernel service. */
+ __u16 ls_version;
+
+ /* Which LFSCK components to be (have been) started. */
+ __u16 ls_active;
+
+ /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+ __u16 ls_flags;
+
+ /* For 64-bits aligned. */
+ __u16 ls_padding;
+};
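+
+/*
+ * Usage sketch, for illustration only (the _example helper and the chosen
+ * values are hypothetical): filling the request for a namespace LFSCK
+ * limited to 1000 items per second.  Only arguments whose bits are set in
+ * ls_valid are taken into account.
+ */
+static inline void lfsck_start_fill_example(struct lfsck_start *ls)
+{
+ ls->ls_version = LFSCK_VERSION_V2;
+ ls->ls_active = LT_NAMESPACE;
+ ls->ls_flags = LPF_RESET;
+ ls->ls_valid = LSV_SPEED_LIMIT;
+ ls->ls_speed_limit = 1000;
+ ls->ls_padding = 0;
+}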
+
+#endif /* _LUSTRE_LFSCK_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
new file mode 100644
index 000000000..89794fdfe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
@@ -0,0 +1,1179 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LUSTRE_USER_H
+#define _LUSTRE_USER_H
+
+/** \defgroup lustreuser lustreuser
+ *
+ * @{
+ */
+
+#include "ll_fiemap.h"
+#include "../linux/lustre_user.h"
+
+/* for statfs() */
+#define LL_SUPER_MAGIC 0x0BD00BD0
+
+#ifndef FSFILT_IOC_GETFLAGS
+#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long)
+#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long)
+#define FSFILT_IOC_GETVERSION _IOR('f', 3, long)
+#define FSFILT_IOC_SETVERSION _IOW('f', 4, long)
+#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long)
+#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long)
+#define FSFILT_IOC_FIEMAP _IOWR('f', 11, struct ll_user_fiemap)
+#endif
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+ OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */
+ OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */
+ OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
+ OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+ OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+};
+
+struct obd_statfs {
+ __u64 os_type;
+ __u64 os_blocks;
+ __u64 os_bfree;
+ __u64 os_bavail;
+ __u64 os_files;
+ __u64 os_ffree;
+ __u8 os_fsid[40];
+ __u32 os_bsize;
+ __u32 os_namelen;
+ __u64 os_maxbytes;
+ __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */
+ __u32 os_fprecreated; /* objs available now to the caller */
+ /* used in QoS code to find preferred
+ * OSTs */
+ __u32 os_spare2;
+ __u32 os_spare3;
+ __u32 os_spare4;
+ __u32 os_spare5;
+ __u32 os_spare6;
+ __u32 os_spare7;
+ __u32 os_spare8;
+ __u32 os_spare9;
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+ /**
+ * FID sequence. Sequence is a unit of migration: all files (objects)
+ * with FIDs from a given sequence are stored on the same server.
+ * Lustre should support 2^64 objects, so even if each sequence
+ * has only a single object we can still enumerate 2^64 objects.
+ **/
+ __u64 f_seq;
+ /* FID number within sequence. */
+ __u32 f_oid;
+ /**
+ * FID version, used to distinguish different versions (in the sense
+ * of snapshots, etc.) of the same file system object. Not currently
+ * used.
+ **/
+ __u32 f_ver;
+};
+
+struct filter_fid {
+ struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */
+};
+
+/* keep this one for compatibility */
+struct filter_fid_old {
+ struct lu_fid ff_parent;
+ __u64 ff_objid;
+ __u64 ff_seq;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them. Other functions (e.g. compare, swab) could be moved
+ * here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
+
+/**
+ * The following struct holds object attributes that are kept in the inode's EA.
+ * Introduced in the 2.0 release (please see b15993 for details).
+ * Added to all objects since Lustre 2.4 as it contains the self FID.
+ */
+struct lustre_mdt_attrs {
+ /**
+ * Bitfield for supported data in this structure. From enum lma_compat.
+ * lma_self_fid and lma_flags are always available.
+ */
+ __u32 lma_compat;
+ /**
+ * Per-file incompat feature list. Lustre version should support all
+ * flags set in this field. The supported feature mask is available in
+ * LMA_INCOMPAT_SUPP.
+ */
+ __u32 lma_incompat;
+ /** FID of this inode */
+ struct lu_fid lma_self_fid;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes, which have
+ * since been moved to a dedicated xattr.
+ * lma_flags was also removed in favour of the lma_compat/lma_incompat fields.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+/**
+ * OST object IDentifier.
+ */
+struct ost_id {
+ union {
+ struct ostid {
+ __u64 oi_id;
+ __u64 oi_seq;
+ } oi;
+ struct lu_fid oi_fid;
+ };
+};
+
+#define DOSTID "%#llx:%llu"
+#define POSTID(oi) ostid_seq(oi), ostid_id(oi)
+
+/*
+ * The ioctl naming rules:
+ * LL_* - works on the currently opened filehandle instead of parent dir
+ * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly)
+ * *_MDC_* - gets/sets data related to MDC
+ * *_LOV_* - gets/sets data related to OSC/LOV
+ * *FILE* - called on parent dir and passes in a filename
+ * *STRIPE* - set/get lov_user_md
+ * *INFO - set/get lov_user_mds_data
+ */
+/* see <lustre_lib.h> for ioctl numbers 101-150 */
+#define LL_IOC_GETFLAGS _IOR ('f', 151, long)
+#define LL_IOC_SETFLAGS _IOW ('f', 152, long)
+#define LL_IOC_CLRFLAGS _IOW ('f', 153, long)
+/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */
+#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long)
+/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */
+#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long)
+/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */
+#define LL_IOC_LOV_SETEA _IOW ('f', 156, long)
+#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long)
+#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid)
+#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long)
+#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long)
+/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */
+#define LL_IOC_QUOTACHECK _IOW ('f', 160, int)
+/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */
+#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *)
+/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */
+#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl)
+#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *)
+#define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *)
+#define LL_IOC_FLUSHCTX _IOW ('f', 166, long)
+#define LL_IOC_RMTACL _IOW ('f', 167, long)
+#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long)
+#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long)
+#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long)
+#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid)
+#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long)
+#define LL_IOC_PATH2FID _IOR ('f', 173, long)
+#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *)
+#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int)
+
+/* see <lustre_lib.h> for ioctl numbers 177-210 */
+
+#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state)
+#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set)
+#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm)
+#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *)
+#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *)
+#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request)
+#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request)
+#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version)
+#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \
+ struct lustre_swap_layouts)
+#define LL_IOC_HSM_ACTION _IOR('f', 220, \
+ struct hsm_current_action)
+/* see <lustre_lib.h> for ioctl numbers 221-232 */
+
+#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md)
+#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md)
+#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64)
+#define LL_IOC_SET_LEASE _IOWR('f', 243, long)
+#define LL_IOC_GET_LEASE _IO('f', 244)
+#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import)
+
+#define LL_STATFS_LMV 1
+#define LL_STATFS_LOV 2
+#define LL_STATFS_NODELAY 4
+
+#define IOC_MDC_TYPE 'i'
+#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *)
+#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *)
+#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *)
+
+/* Keep these for backward compatibility. */
+#define LL_IOC_OBD_STATFS IOC_OBD_STATFS
+#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE
+
+
+#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */
+
+/* Define O_LOV_DELAY_CREATE as a combination of flags that is not useful for
+ * regular files, is unlikely to be used in practice, and is not harmful if
+ * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character
+ * devices and are safe for use on new files (see LU-812, LU-4209). */
+#define O_LOV_DELAY_CREATE (O_NOCTTY | FASYNC)
+
+#define LL_FILE_IGNORE_LOCK 0x00000001
+#define LL_FILE_GROUP_LOCKED 0x00000002
+#define LL_FILE_READAHEA 0x00000004
+#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */
+#define LL_FILE_RMTACL 0x00000020
+
+#define LOV_USER_MAGIC_V1 0x0BD10BD0
+#define LOV_USER_MAGIC LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
+
+#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */
+#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/
+
+#define LOV_PATTERN_RAID0 0x001
+#define LOV_PATTERN_RAID1 0x002
+#define LOV_PATTERN_FIRST 0x100
+
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
+#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that an input of 4096 results in 160,
+ * which in turn equals the old maximal stripe count.
+ * XXX: In fact this is too simplified for now; what it also needs is an
+ * ea_type argument to know exactly how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES 0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define lov_user_ost_data lov_user_ost_data_v1
+struct lov_user_ost_data_v1 { /* per-stripe data structure */
+ struct ost_id l_ost_oi; /* OST object ID */
+ __u32 l_ost_gen; /* generation of this OST index */
+ __u32 l_ost_idx; /* OST index in LOV */
+} __attribute__((packed));
+
+#define lov_user_md lov_user_md_v1
+struct lov_user_md_v1 { /* LOV EA user data (host-endian) */
+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ union {
+ __u16 lmm_stripe_offset; /* starting stripe offset in
+ * lmm_objects, use when writing */
+ __u16 lmm_layout_gen; /* layout generation number
+ * used when reading */
+ };
+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed, __may_alias__));
+
+struct lov_user_md_v3 { /* LOV EA user data (host-endian) */
+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ struct ost_id lmm_oi; /* LOV object ID */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ __u16 lmm_stripe_count; /* num stripes in use for this object */
+ union {
+ __u16 lmm_stripe_offset; /* starting stripe offset in
+ * lmm_objects, use when writing */
+ __u16 lmm_layout_gen; /* layout generation number
+ * used when reading */
+ };
+ char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
+static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic)
+{
+ if (lmm_magic == LOV_USER_MAGIC_V3)
+ return sizeof(struct lov_user_md_v3) +
+ stripes * sizeof(struct lov_user_ost_data_v1);
+ else
+ return sizeof(struct lov_user_md_v1) +
+ stripes * sizeof(struct lov_user_ost_data_v1);
+}
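+
+/*
+ * Usage sketch, for illustration only (the _example helper is hypothetical):
+ * preparing a V3 buffer for LL_IOC_LOV_GETSTRIPE.  The caller allocates
+ * lov_user_md_size(max_stripes, LOV_USER_MAGIC_V3) bytes and seeds the magic
+ * and stripe count so the kernel knows which format and how much room it may
+ * fill; issuing the ioctl(2) itself is left to the caller.
+ */
+static inline void lov_user_md_v3_prep_example(struct lov_user_md_v3 *lum,
+                                               __u16 max_stripes)
+{
+ lum->lmm_magic = LOV_USER_MAGIC_V3;
+ lum->lmm_stripe_count = max_stripes;
+}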
+
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this. It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#ifdef HAVE_LOV_USER_MDS_DATA
+#define lov_user_mds_data lov_user_mds_data_v1
+struct lov_user_mds_data_v1 {
+ lstat_t lmd_st; /* MDS stat struct */
+ struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+ lstat_t lmd_st; /* MDS stat struct */
+ struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */
+} __attribute__((packed));
+#endif
+
+/* keep this the same size as lov_user_ost_data_v1 */
+struct lmv_user_mds_data {
+ struct lu_fid lum_fid;
+ __u32 lum_padding;
+ __u32 lum_mds;
+};
+
+/* lum_type */
+enum {
+ LMV_STRIPE_TYPE = 0,
+ LMV_DEFAULT_TYPE = 1,
+};
+
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+ __u32 lum_magic; /* must be the first field */
+ __u32 lum_stripe_count; /* dirstripe count */
+ __u32 lum_stripe_offset; /* MDT idx for default dirstripe */
+ __u32 lum_hash_type; /* Dir stripe policy */
+ __u32 lum_type; /* LMV type: default or normal */
+ __u32 lum_padding1;
+ __u32 lum_padding2;
+ __u32 lum_padding3;
+ char lum_pool_name[LOV_MAXPOOLNAME];
+ struct lmv_user_mds_data lum_objects[0];
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+ return sizeof(struct lmv_user_md) +
+ stripes * sizeof(struct lmv_user_mds_data);
+}
+
+extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
+
+struct ll_recreate_obj {
+ __u64 lrc_id;
+ __u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+ __u64 id; /* holds object id */
+ __u32 generation; /* holds object generation */
+ __u32 f_type; /* holds object type or stripe idx when passing it to
+ * OST for saving into EA. */
+};
+
+#define UUID_MAX 40
+struct obd_uuid {
+ char uuid[UUID_MAX];
+};
+
+static inline bool obd_uuid_equals(const struct obd_uuid *u1,
+ const struct obd_uuid *u2)
+{
+ return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0;
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+ return uuid->uuid[0] == '\0';
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+ strncpy((char *)uuid->uuid, tmp, sizeof(*uuid));
+ uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(const struct obd_uuid *uuid)
+{
+ if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+ /* Obviously not safe, but for printfs, no real harm done...
+ we're always null-terminated, even in a race. */
+ static char temp[sizeof(*uuid)];
+ memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+ temp[sizeof(*uuid) - 1] = '\0';
+ return temp;
+ }
+ return (char *)(uuid->uuid);
+}
+
+/* Extract fsname from uuid (or target name) of a target
+ e.g. (myfs-OST0007_UUID -> myfs)
+ see also deuuidify. */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+ char *p;
+
+ strncpy(buf, uuid, buflen - 1);
+ buf[buflen - 1] = '\0';
+ p = strrchr(buf, '-');
+ if (p)
+ *p = '\0';
+}
+
+/* printf display format
+ e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define FID_NOBRACE_LEN 40
+#define FID_LEN (FID_NOBRACE_LEN + 2)
+#define DFID_NOBRACE "%#llx:0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid) \
+ (fid)->f_seq, \
+ (fid)->f_oid, \
+ (fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+ e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+#define SFID "0x%llx:0x%x:0x%x"
+#define RFID(fid) \
+ &((fid)->f_seq), \
+ &((fid)->f_oid), \
+ &((fid)->f_ver)
+
+
+/********* Quotas **********/
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */
+#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */
+#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */
+#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */
+
+#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */
+
+struct if_quotacheck {
+ char obd_type[16];
+ struct obd_uuid obd_uuid;
+};
+
+#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629
+
+/* permission */
+#define N_PERMS_MAX 64
+
+struct perm_downcall_data {
+ __u64 pdd_nid;
+ __u32 pdd_perm;
+ __u32 pdd_padding;
+};
+
+struct identity_downcall_data {
+ __u32 idd_magic;
+ __u32 idd_err;
+ __u32 idd_uid;
+ __u32 idd_gid;
+ __u32 idd_nperms;
+ __u32 idd_ngroups;
+ struct perm_downcall_data idd_perms[N_PERMS_MAX];
+ __u32 idd_groups[0];
+};
+
+/* for non-mapped uid/gid */
+#define NOBODY_UID 99
+#define NOBODY_GID 99
+
+#define INVALID_ID (-1)
+
+enum {
+ RMT_LSETFACL = 1,
+ RMT_LGETFACL = 2,
+ RMT_RSETFACL = 3,
+ RMT_RGETFACL = 4
+};
+
+#ifdef NEED_QUOTA_DEFS
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS 1
+#define QIF_SPACE 2
+#define QIF_ILIMITS 4
+#define QIF_INODES 8
+#define QIF_BTIME 16
+#define QIF_ITIME 32
+#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif
+
+#endif /* !__KERNEL__ */
+
+/* lustre volatile file support
+ * file name header: .^L^S^T^R:VOLATILE
+ */
+#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE"
+#define LUSTRE_VOLATILE_HDR_LEN 14
+/* hdr + MDT index */
+#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:"
+
+typedef enum lustre_quota_version {
+ LUSTRE_QUOTA_V2 = 1
+} lustre_quota_version_t;
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
+ __u64 dqi_bgrace;
+ __u64 dqi_igrace;
+ __u32 dqi_flags;
+ __u32 dqi_valid;
+};
+
+/* XXX: same as if_dqblk struct in kernel, plus one padding */
+struct obd_dqblk {
+ __u64 dqb_bhardlimit;
+ __u64 dqb_bsoftlimit;
+ __u64 dqb_curspace;
+ __u64 dqb_ihardlimit;
+ __u64 dqb_isoftlimit;
+ __u64 dqb_curinodes;
+ __u64 dqb_btime;
+ __u64 dqb_itime;
+ __u32 dqb_valid;
+ __u32 dqb_padding;
+};
+
+enum {
+ QC_GENERAL = 0,
+ QC_MDTIDX = 1,
+ QC_OSTIDX = 2,
+ QC_UUID = 3
+};
+
+struct if_quotactl {
+ __u32 qc_cmd;
+ __u32 qc_type;
+ __u32 qc_id;
+ __u32 qc_stat;
+ __u32 qc_valid;
+ __u32 qc_idx;
+ struct obd_dqinfo qc_dqinfo;
+ struct obd_dqblk qc_dqblk;
+ char obd_type[16];
+ struct obd_uuid obd_uuid;
+};
+
+/* swap layout flags */
+#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0)
+#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1)
+#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2)
+#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3)
+
+/* Swap XATTR_NAME_HSM as well, only on the MDT so far */
+#define SWAP_LAYOUTS_MDS_HSM (1 << 31)
+struct lustre_swap_layouts {
+ __u64 sl_flags;
+ __u32 sl_fd;
+ __u32 sl_gid;
+ __u64 sl_dv1;
+ __u64 sl_dv2;
+};
+
+
+/********* Changelogs **********/
+/** Changelog record types */
+enum changelog_rec_type {
+ CL_MARK = 0,
+ CL_CREATE = 1, /* namespace */
+ CL_MKDIR = 2, /* namespace */
+ CL_HARDLINK = 3, /* namespace */
+ CL_SOFTLINK = 4, /* namespace */
+ CL_MKNOD = 5, /* namespace */
+ CL_UNLINK = 6, /* namespace */
+ CL_RMDIR = 7, /* namespace */
+ CL_RENAME = 8, /* namespace */
+ CL_EXT = 9, /* namespace extended record (2nd half of rename) */
+ CL_OPEN = 10, /* not currently used */
+ CL_CLOSE = 11, /* may be written to log only with mtime change */
+ CL_LAYOUT = 12, /* file layout/striping modified */
+ CL_TRUNC = 13,
+ CL_SETATTR = 14,
+ CL_XATTR = 15,
+ CL_HSM = 16, /* HSM specific events, see flags */
+ CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */
+ CL_CTIME = 18,
+ CL_ATIME = 19,
+ CL_LAST
+};
+
+static inline const char *changelog_type2str(int type) {
+ static const char *changelog_str[] = {
+ "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
+ "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC",
+ "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME",
+ };
+
+ if (type >= 0 && type < CL_LAST)
+ return changelog_str[type];
+ return NULL;
+}
+
+/* per-record flags */
+#define CLF_VERSION 0x1000
+#define CLF_EXT_VERSION 0x2000
+#define CLF_FLAGSHIFT 12
+#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1)
+#define CLF_VERMASK (~CLF_FLAGMASK)
+/* Anything under the flagmask may be per-type (if desired) */
+/* Flags for unlink */
+#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */
+#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */
+ /* HSM cleaning needed */
+/* Flags for rename */
+#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of target */
+
+/* Flags for HSM */
+/* 12b used (from high weight to low weight):
+ * 2b for flags
+ * 3b for event
+ * 7b for error code
+ */
+#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */
+#define CLF_HSM_ERR_H 6
+#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */
+#define CLF_HSM_EVENT_H 9
+#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */
+#define CLF_HSM_FLAG_H 11
+#define CLF_HSM_SPARE_L 12 /* 4 spare bits */
+#define CLF_HSM_SPARE_H 15
+#define CLF_HSM_LAST 15
+
+/* Remove bits higher than _h, then extract the value
+ * between _h and _l by shifting the lower weight to bit 0. */
+#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \
+ >> (CLF_HSM_LAST - _h + _l))
+
+#define CLF_HSM_SUCCESS 0x00
+#define CLF_HSM_MAXERROR 0x7E
+#define CLF_HSM_ERROVERFLOW 0x7F
+
+#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */
+
+/* 3 bits field => 8 values allowed */
+enum hsm_event {
+ HE_ARCHIVE = 0,
+ HE_RESTORE = 1,
+ HE_CANCEL = 2,
+ HE_RELEASE = 3,
+ HE_REMOVE = 4,
+ HE_STATE = 5,
+ HE_SPARE1 = 6,
+ HE_SPARE2 = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+ return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(int *flags, enum hsm_event he)
+{
+ *flags |= (he << CLF_HSM_EVENT_L);
+}
+
+static inline __u16 hsm_get_cl_flags(int flags)
+{
+ return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+
+static inline void hsm_set_cl_flags(int *flags, int bits)
+{
+ *flags |= (bits << CLF_HSM_FLAG_L);
+}
+
+static inline int hsm_get_cl_error(int flags)
+{
+ return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+
+static inline void hsm_set_cl_error(int *flags, int error)
+{
+ *flags |= (error << CLF_HSM_ERR_L);
+}
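+
+/* Illustrative usage sketch (not part of the original header): the helpers
+ * above pack/unpack the HSM sub-fields of a changelog record's flag word.
+ * A worked example, RESTORE event with error 5 and the DIRTY flag:
+ *
+ *	int flags = 0;
+ *
+ *	hsm_set_cl_event(&flags, HE_RESTORE);      sets bits 7..9 to 1
+ *	hsm_set_cl_error(&flags, 5);               sets bits 0..6 to 5
+ *	hsm_set_cl_flags(&flags, CLF_HSM_DIRTY);   sets bit 10
+ *
+ * Afterwards hsm_get_cl_event(flags) yields HE_RESTORE,
+ * hsm_get_cl_error(flags) yields 5, and
+ * hsm_get_cl_flags(flags) & CLF_HSM_DIRTY is non-zero.
+ */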
+
+#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + \
+ sizeof(struct changelog_ext_rec))
+
+struct changelog_rec {
+ __u16 cr_namelen;
+ __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */
+ __u32 cr_type; /**< \a changelog_rec_type */
+ __u64 cr_index; /**< changelog record number */
+ __u64 cr_prev; /**< last index for this target fid */
+ __u64 cr_time;
+ union {
+ lustre_fid cr_tfid; /**< target fid */
+ __u32 cr_markerflags; /**< CL_MARK flags */
+ };
+ lustre_fid cr_pfid; /**< parent fid */
+ char cr_name[0]; /**< last element */
+} __attribute__((packed));
+
+/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec.  To save
+ * space, only rename uses changelog_ext_rec, while other record types use
+ * changelog_rec.
+ */
+struct changelog_ext_rec {
+ __u16 cr_namelen;
+ __u16 cr_flags; /**< (flags & CLF_FLAGMASK) |
+ CLF_EXT_VERSION */
+ __u32 cr_type; /**< \a changelog_rec_type */
+ __u64 cr_index; /**< changelog record number */
+ __u64 cr_prev; /**< last index for this target fid */
+ __u64 cr_time;
+ union {
+ lustre_fid cr_tfid; /**< target fid */
+ __u32 cr_markerflags; /**< CL_MARK flags */
+ };
+ lustre_fid cr_pfid; /**< target parent fid */
+ lustre_fid cr_sfid; /**< source fid, or zero */
+ lustre_fid cr_spfid; /**< source parent fid, or zero */
+ char cr_name[0]; /**< last element */
+} __attribute__((packed));
+
+#define CHANGELOG_REC_EXTENDED(rec) \
+ (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION)
+
+static inline int changelog_rec_size(struct changelog_rec *rec)
+{
+ return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec):
+ sizeof(*rec);
+}
+
+static inline char *changelog_rec_name(struct changelog_rec *rec)
+{
+ return CHANGELOG_REC_EXTENDED(rec) ?
+ ((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name;
+}
+
+static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec)
+{
+ return rec->cr_namelen - strlen(rec->cr_name) - 1;
+}
+
+static inline char *changelog_rec_sname(struct changelog_ext_rec *rec)
+{
+ return rec->cr_name + strlen(rec->cr_name) + 1;
+}
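+
+/* Illustrative usage sketch (not part of the original header): a changelog
+ * consumer can handle plain and extended (rename) records uniformly with the
+ * helpers above; "rec" stands for a record obtained elsewhere:
+ *
+ *	struct changelog_rec *rec = ...;
+ *	char *tname = changelog_rec_name(rec);		target name
+ *
+ *	if (CHANGELOG_REC_EXTENDED(rec)) {
+ *		struct changelog_ext_rec *ext =
+ *			(struct changelog_ext_rec *)rec;
+ *		char *sname = changelog_rec_sname(ext);	source name of rename
+ *		int slen = changelog_rec_snamelen(ext);
+ *	}
+ */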
+
+struct ioc_changelog {
+ __u64 icc_recno;
+ __u32 icc_mdtindex;
+ __u32 icc_id;
+ __u32 icc_flags;
+};
+
+enum changelog_message_type {
+ CL_RECORD = 10, /* message is a changelog_rec */
+ CL_EOF = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+ __u64 idv_version;
+ __u64 idv_flags; /* See LL_DV_xxx */
+};
+#define LL_DV_NOFLUSH 0x01 /* Do not take READ EXTENT LOCK before sampling
+ version. Dirty caches are left unchanged. */
+
+#ifndef offsetof
+# define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS_MASK below.
+ */
+enum hsm_states {
+ HS_EXISTS = 0x00000001,
+ HS_DIRTY = 0x00000002,
+ HS_RELEASED = 0x00000004,
+ HS_ARCHIVED = 0x00000008,
+ HS_NORELEASE = 0x00000010,
+ HS_NOARCHIVE = 0x00000020,
+ HS_LOST = 0x00000040,
+};
+
+/* HSM user-settable flags. */
+#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags. */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+ HPS_WAITING = 1,
+ HPS_RUNNING = 2,
+ HPS_DONE = 3,
+};
+#define HPS_NONE 0
+
+static inline char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+ switch (s) {
+ case HPS_WAITING: return "waiting";
+ case HPS_RUNNING: return "running";
+ case HPS_DONE: return "done";
+ default: return "unknown";
+ }
+}
+
+struct hsm_extent {
+ __u64 offset;
+ __u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * This structure is mainly intended to be sent to user-space. It describes the
+ * current HSM flags and in-progress action.
+ */
+struct hsm_user_state {
+ /** Current HSM states, from enum hsm_states. */
+ __u32 hus_states;
+ __u32 hus_archive_id;
+	/** The action currently in progress, if there is one */
+ __u32 hus_in_progress_state;
+ __u32 hus_in_progress_action;
+ struct hsm_extent hus_in_progress_location;
+ char hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+ struct lu_fid hssi_fid;
+ __u64 hssi_setmask;
+ __u64 hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+	/** The action currently in progress, if there is one */
+ /* state is one of hsm_progress_states */
+ __u32 hca_state;
+ /* action is one of hsm_user_action */
+ __u32 hca_action;
+ struct hsm_extent hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types */
+enum hsm_user_action {
+ HUA_NONE = 1, /* no action (noop) */
+ HUA_ARCHIVE = 10, /* copy to hsm */
+ HUA_RESTORE = 11, /* prestage */
+ HUA_RELEASE = 12, /* drop ost objects */
+ HUA_REMOVE = 13, /* remove from archive */
+ HUA_CANCEL = 14 /* cancel a request */
+};
+
+static inline char *hsm_user_action2name(enum hsm_user_action a)
+{
+ switch (a) {
+ case HUA_NONE: return "NOOP";
+ case HUA_ARCHIVE: return "ARCHIVE";
+ case HUA_RESTORE: return "RESTORE";
+ case HUA_RELEASE: return "RELEASE";
+ case HUA_REMOVE: return "REMOVE";
+ case HUA_CANCEL: return "CANCEL";
+ default: return "UNKNOWN";
+ }
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY 0x0002
+
+/**
+ * Contains the fixed part of struct hsm_user_request.
+ */
+struct hsm_request {
+ __u32 hr_action; /* enum hsm_user_action */
+ __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */
+ __u64 hr_flags; /* request flags */
+ __u32 hr_itemcount; /* item count in hur_user_item vector */
+ __u32 hr_data_len;
+};
+
+struct hsm_user_item {
+ lustre_fid hui_fid;
+ struct hsm_extent hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {
+ struct hsm_request hur_request;
+ struct hsm_user_item hur_user_item[0];
+ /* extra data blob at end of struct (after all
+ * hur_user_items), only use helpers to access it
+ */
+} __attribute__((packed));
+
+/** Return pointer to data field in a hsm user request */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+ return &(hur->hur_user_item[hur->hur_request.hr_itemcount]);
+}
+
+/**
+ * Compute the current length of the provided hsm_user_request. This returns -1
+ * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ]
+ *
+ * return -1 on bounds check error.
+ */
+static inline ssize_t hur_len(struct hsm_user_request *hur)
+{
+ __u64 size;
+
+ /* can't overflow a __u64 since hr_itemcount is only __u32 */
+ size = offsetof(struct hsm_user_request, hur_user_item[0]) +
+ (__u64)hur->hur_request.hr_itemcount *
+ sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len;
+
+ if (size != (ssize_t)size)
+ return -1;
+
+ return size;
+}
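+
+/* Illustrative usage sketch (not part of the original header): a userspace
+ * caller building an HSM request sizes its buffer with the same layout that
+ * hur_len()/hur_data() assume: fixed header, hr_itemcount items, then
+ * hr_data_len bytes of opaque data.  "nr" and "data_len" are hypothetical:
+ *
+ *	int nr = 2, data_len = 16;
+ *	size_t len = sizeof(struct hsm_user_request) +
+ *		     nr * sizeof(struct hsm_user_item) + data_len;
+ *	struct hsm_user_request *hur = malloc(len);
+ *
+ *	hur->hur_request.hr_itemcount = nr;
+ *	hur->hur_request.hr_data_len = data_len;
+ *	... fill hur->hur_user_item[0..nr-1], copy opaque data to hur_data(hur) ...
+ *
+ * hur_len(hur) then equals (ssize_t)len.
+ */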
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+ HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item */
+enum hsm_copytool_action {
+ HSMA_NONE = 10, /* no action */
+ HSMA_ARCHIVE = 20, /* arbitrary offset */
+ HSMA_RESTORE = 21,
+ HSMA_REMOVE = 22,
+ HSMA_CANCEL = 23
+};
+
+static inline char *hsm_copytool_action2name(enum hsm_copytool_action a)
+{
+ switch (a) {
+ case HSMA_NONE: return "NOOP";
+ case HSMA_ARCHIVE: return "ARCHIVE";
+ case HSMA_RESTORE: return "RESTORE";
+ case HSMA_REMOVE: return "REMOVE";
+ case HSMA_CANCEL: return "CANCEL";
+ default: return "UNKNOWN";
+ }
+}
+
+/* Copytool item action description */
+struct hsm_action_item {
+ __u32 hai_len; /* valid size of this struct */
+ __u32 hai_action; /* hsm_copytool_action, but use known size */
+	lustre_fid hai_fid;     /* Lustre FID to operate on */
+ lustre_fid hai_dfid; /* fid used for data access */
+ struct hsm_extent hai_extent; /* byte range to operate on */
+ __u64 hai_cookie; /* action cookie from coordinator */
+ __u64 hai_gid; /* grouplock id */
+ char hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/*
+ * Helper function that prints in hex the first bytes of the
+ * hai opaque data field.
+ * \param hai [IN] record to print
+ * \param buffer [OUT] output buffer
+ * \param len [IN] max buffer len
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+ char *buffer, int len)
+{
+ int i, sz, data_len;
+ char *ptr;
+
+ ptr = buffer;
+ sz = len;
+ data_len = hai->hai_len - sizeof(*hai);
+ for (i = 0 ; (i < data_len) && (sz > 0) ; i++) {
+ int cnt;
+
+ cnt = snprintf(ptr, sz, "%.2X",
+ (unsigned char)hai->hai_data[i]);
+ ptr += cnt;
+ sz -= cnt;
+ }
+ *ptr = '\0';
+ return buffer;
+}
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+ __u32 hal_version;
+ __u32 hal_count; /* number of hai's to follow */
+ __u64 hal_compound_id; /* returned by coordinator */
+ __u64 hal_flags;
+ __u32 hal_archive_id; /* which archive backend */
+ __u32 padding1;
+ char hal_fsname[0]; /* null-terminated */
+ /* struct hsm_action_item[hal_count] follows, aligned on 8-byte
+ boundaries. See hai_zero */
+} __attribute__((packed));
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round (int val)
+{
+ return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/* Return pointer to first hai in action list */
+static inline struct hsm_action_item *hai_zero(struct hsm_action_list *hal)
+{
+	return (struct hsm_action_item *)(hal->hal_fsname +
+					  cfs_size_round(strlen(hal->hal_fsname)
+							 + 1));
+}
+/* Return pointer to next hai */
+static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai)
+{
+ return (struct hsm_action_item *)((char *)hai +
+ cfs_size_round(hai->hai_len));
+}
+
+/* Return size of an hsm_action_list */
+static inline int hal_size(struct hsm_action_list *hal)
+{
+ int i, sz;
+ struct hsm_action_item *hai;
+
+ sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1);
+ hai = hai_zero(hal);
+ for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai))
+ sz += cfs_size_round(hai->hai_len);
+
+ return sz;
+}
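+
+/* Illustrative usage sketch (not part of the original header): a copytool
+ * walks the items of a received hsm_action_list with hai_zero()/hai_next(),
+ * the same way hal_size() does above:
+ *
+ *	struct hsm_action_item *hai = hai_zero(hal);
+ *	int i;
+ *
+ *	for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) {
+ *		switch (hai->hai_action) {
+ *		case HSMA_ARCHIVE:
+ *		case HSMA_RESTORE:
+ *			... operate on hai->hai_fid over hai->hai_extent ...
+ *			break;
+ *		default:
+ *			break;
+ *		}
+ *	}
+ */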
+
+/* HSM file import
+ * describes the attributes to be set on the imported file
+ */
+struct hsm_user_import {
+ __u64 hui_size;
+ __u64 hui_atime;
+ __u64 hui_mtime;
+ __u32 hui_atime_ns;
+ __u32 hui_mtime_ns;
+ __u32 hui_uid;
+ __u32 hui_gid;
+ __u32 hui_mode;
+ __u32 hui_archive_id;
+};
+
+/* Copytool progress reporting */
+#define HP_FLAG_COMPLETED 0x01
+#define HP_FLAG_RETRY 0x02
+
+struct hsm_progress {
+ lustre_fid hp_fid;
+ __u64 hp_cookie;
+ struct hsm_extent hp_extent;
+ __u16 hp_flags;
+ __u16 hp_errval; /* positive val */
+ __u32 padding;
+};
+
+struct hsm_copy {
+ __u64 hc_data_version;
+ __u16 hc_flags;
+ __u16 hc_errval; /* positive val */
+ __u32 padding;
+ struct hsm_action_item hc_hai;
+};
+
+/** @} lustreuser */
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h
new file mode 100644
index 000000000..aa4cfa7b7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_acl.h
@@ -0,0 +1,49 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_acl.h
+ */
+
+#ifndef _LUSTRE_ACL_H
+#define _LUSTRE_ACL_H
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/posix_acl_xattr.h>
+
+#define LUSTRE_POSIX_ACL_MAX_ENTRIES 32
+#define LUSTRE_POSIX_ACL_MAX_SIZE \
+ (sizeof(posix_acl_xattr_header) + \
+ LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry))
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h
new file mode 100644
index 000000000..fe19534eb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_capa.h
@@ -0,0 +1,305 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_capa.h
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#ifndef __LINUX_CAPA_H_
+#define __LINUX_CAPA_H_
+
+/** \defgroup capa capa
+ *
+ * @{
+ */
+
+/*
+ * capability
+ */
+#include <linux/crypto.h>
+#include "lustre/lustre_idl.h"
+
+#define CAPA_TIMEOUT 1800 /* sec, == 30 min */
+#define CAPA_KEY_TIMEOUT (24 * 60 * 60) /* sec, == 1 day */
+
+struct capa_hmac_alg {
+ const char *ha_name;
+ int ha_len;
+ int ha_keylen;
+};
+
+#define DEF_CAPA_HMAC_ALG(name, type, len, keylen) \
+[CAPA_HMAC_ALG_ ## type] = { \
+ .ha_name = name, \
+ .ha_len = len, \
+ .ha_keylen = keylen, \
+}
+
+struct client_capa {
+ struct inode *inode;
+ struct list_head lli_list; /* link to lli_oss_capas */
+};
+
+struct target_capa {
+ struct hlist_node c_hash; /* link to capa hash */
+};
+
+struct obd_capa {
+ struct list_head c_list; /* link to capa_list */
+
+ struct lustre_capa c_capa; /* capa */
+ atomic_t c_refc; /* ref count */
+ unsigned long c_expiry; /* jiffies */
+ spinlock_t c_lock; /* protect capa content */
+ int c_site;
+
+ union {
+ struct client_capa cli;
+ struct target_capa tgt;
+ } u;
+};
+
+enum {
+ CAPA_SITE_CLIENT = 0,
+ CAPA_SITE_SERVER,
+ CAPA_SITE_MAX
+};
+
+static inline struct lu_fid *capa_fid(struct lustre_capa *capa)
+{
+ return &capa->lc_fid;
+}
+
+static inline __u64 capa_opc(struct lustre_capa *capa)
+{
+ return capa->lc_opc;
+}
+
+static inline __u64 capa_uid(struct lustre_capa *capa)
+{
+ return capa->lc_uid;
+}
+
+static inline __u64 capa_gid(struct lustre_capa *capa)
+{
+ return capa->lc_gid;
+}
+
+static inline __u32 capa_flags(struct lustre_capa *capa)
+{
+ return capa->lc_flags & 0xffffff;
+}
+
+static inline __u32 capa_alg(struct lustre_capa *capa)
+{
+ return (capa->lc_flags >> 24);
+}
+
+static inline __u32 capa_keyid(struct lustre_capa *capa)
+{
+ return capa->lc_keyid;
+}
+
+static inline __u64 capa_key_seq(struct lustre_capa_key *key)
+{
+ return key->lk_seq;
+}
+
+static inline __u32 capa_key_keyid(struct lustre_capa_key *key)
+{
+ return key->lk_keyid;
+}
+
+static inline __u32 capa_timeout(struct lustre_capa *capa)
+{
+ return capa->lc_timeout;
+}
+
+static inline __u32 capa_expiry(struct lustre_capa *capa)
+{
+ return capa->lc_expiry;
+}
+
+void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *,
+ const char *fmt, ...);
+#define DEBUG_CAPA(level, capa, fmt, args...) \
+do { \
+ if (((level) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (level)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ _debug_capa((capa), &msgdata, fmt, ##args); \
+ } \
+} while (0)
+
+#define DEBUG_CAPA_KEY(level, k, fmt, args...) \
+do { \
+CDEBUG(level, fmt " capability key@%p seq %llu keyid %u\n", \
+ ##args, k, capa_key_seq(k), capa_key_keyid(k)); \
+} while (0)
+
+typedef int (*renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *);
+
+/* obdclass/capa.c */
+extern struct list_head capa_list[];
+extern spinlock_t capa_lock;
+extern int capa_count[];
+extern struct kmem_cache *capa_cachep;
+
+struct hlist_head *init_capa_hash(void);
+void cleanup_capa_hash(struct hlist_head *hash);
+
+struct obd_capa *capa_add(struct hlist_head *hash,
+ struct lustre_capa *capa);
+struct obd_capa *capa_lookup(struct hlist_head *hash,
+ struct lustre_capa *capa, int alive);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key);
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+void capa_cpy(void *dst, struct obd_capa *ocapa);
+static inline struct obd_capa *alloc_capa(int site)
+{
+ struct obd_capa *ocapa;
+
+ if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER))
+ return ERR_PTR(-EINVAL);
+
+ OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
+ if (unlikely(!ocapa))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ocapa->c_list);
+ atomic_set(&ocapa->c_refc, 1);
+ spin_lock_init(&ocapa->c_lock);
+ ocapa->c_site = site;
+ if (ocapa->c_site == CAPA_SITE_CLIENT)
+ INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+ else
+ INIT_HLIST_NODE(&ocapa->u.tgt.c_hash);
+
+ return ocapa;
+}
+
+static inline struct obd_capa *capa_get(struct obd_capa *ocapa)
+{
+ if (!ocapa)
+ return NULL;
+
+ atomic_inc(&ocapa->c_refc);
+ return ocapa;
+}
+
+static inline void capa_put(struct obd_capa *ocapa)
+{
+ if (!ocapa)
+ return;
+
+ if (atomic_read(&ocapa->c_refc) == 0) {
+ DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for");
+ LBUG();
+ }
+
+ if (atomic_dec_and_test(&ocapa->c_refc)) {
+ LASSERT(list_empty(&ocapa->c_list));
+ if (ocapa->c_site == CAPA_SITE_CLIENT) {
+ LASSERT(list_empty(&ocapa->u.cli.lli_list));
+ } else {
+ struct hlist_node *hnode;
+
+ hnode = &ocapa->u.tgt.c_hash;
+ LASSERT(!hnode->next && !hnode->pprev);
+ }
+ OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa));
+ }
+}
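+
+/* Illustrative usage sketch (not part of the original header): typical
+ * reference counting with the helpers above.  alloc_capa() returns the
+ * object with one reference; additional users take their own with
+ * capa_get() and every reference is dropped with capa_put(), the last
+ * put freeing the slab object:
+ *
+ *	struct obd_capa *ocapa = alloc_capa(CAPA_SITE_CLIENT);
+ *
+ *	if (IS_ERR(ocapa))
+ *		return PTR_ERR(ocapa);
+ *	capa_get(ocapa);	second user of the capability
+ *	...
+ *	capa_put(ocapa);	second user done
+ *	capa_put(ocapa);	initial reference, frees ocapa
+ */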
+
+static inline int open_flags_to_accmode(int flags)
+{
+	int mode = flags;
+
+	/* Convert O_RDONLY/O_WRONLY/O_RDWR (0/1/2) to
+	 * FMODE_READ/FMODE_WRITE/both (1/2/3). */
+	if ((mode + 1) & O_ACCMODE)
+		mode++;
+	/* O_TRUNC implies write access. */
+	if (mode & O_TRUNC)
+		mode |= 2;
+
+	return mode;
+}
+
+static inline __u64 capa_open_opc(int mode)
+{
+ return mode & FMODE_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ;
+}
+
+static inline void set_capa_expiry(struct obd_capa *ocapa)
+{
+ unsigned long expiry = cfs_time_sub((unsigned long)ocapa->c_capa.lc_expiry,
+ get_seconds());
+ ocapa->c_expiry = cfs_time_add(cfs_time_current(),
+ cfs_time_seconds(expiry));
+}
+
+static inline int capa_is_expired_sec(struct lustre_capa *capa)
+{
+ return (capa->lc_expiry - get_seconds() <= 0);
+}
+
+static inline int capa_is_expired(struct obd_capa *ocapa)
+{
+ return time_before_eq(ocapa->c_expiry, cfs_time_current());
+}
+
+static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc)
+{
+ return (capa_opc(capa) & opc) == opc;
+}
+
+struct filter_capa_key {
+ struct list_head k_list;
+ struct lustre_capa_key k_key;
+};
+
+enum {
+ LC_ID_NONE = 0,
+ LC_ID_PLAIN = 1,
+ LC_ID_CONVERT = 2
+};
+
+#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT)
+
+/** @} capa */
+
+#endif /* __LINUX_CAPA_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h
new file mode 100644
index 000000000..7b385b872
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h
@@ -0,0 +1,293 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_CFG_H
+#define _LUSTRE_CFG_H
+
+/** \defgroup cfg cfg
+ *
+ * @{
+ */
+
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+ cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)]))
+
+/** If the LCFG_REQUIRED bit is set in a configuration command,
+ * then the client is required to understand this parameter
+ * in order to mount the filesystem. If it does not understand
+ * a REQUIRED command the client mount will fail. */
+#define LCFG_REQUIRED 0x0001000
+
+enum lcfg_command_type {
+ LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */
+ LCFG_DETACH = 0x00cf002, /**< destroy obd instance */
+ LCFG_SETUP = 0x00cf003, /**< call type-specific setup */
+ LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup */
+ LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */
+ LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from a niduuid */
+ LCFG_MOUNTOPT = 0x00cf007, /**< create a profile (mdc, osc) */
+ LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */
+ LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */
+ LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */
+ LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to an obd */
+ LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */
+ LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */
+ LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */
+ LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */
+ LCFG_MARKER = 0x00cf010, /**< metadata about next cfg rec */
+ LCFG_LOG_START = 0x00ce011, /**< mgc only, process a cfg log */
+ LCFG_LOG_END = 0x00ce012, /**< stop processing updates */
+ LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, inactive */
+ LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */
+ LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */
+ LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */
+ LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */
+ LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */
+ LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */
+ LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */
+ LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */
+	LCFG_PRE_CLEANUP	= 0x00cf031, /**< call type-specific
+					      * pre-cleanup */
+	LCFG_SET_PARAM		= 0x00ce032, /**< use set_param syntax to set
+					      * a proc parameter */
+};
+
+struct lustre_cfg_bufs {
+ void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+ __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+ __u32 lcfg_bufcount;
+};
+
+struct lustre_cfg {
+ __u32 lcfg_version;
+ __u32 lcfg_command;
+
+ __u32 lcfg_num;
+ __u32 lcfg_flags;
+ __u64 lcfg_nid;
+ __u32 lcfg_nal; /* not used any more */
+
+ __u32 lcfg_bufcount;
+ __u32 lcfg_buflens[0];
+};
+
+enum cfg_record_type {
+ PORTALS_CFG_TYPE = 1,
+ LUSTRE_CFG_TYPE = 123,
+};
+
+#define LUSTRE_CFG_BUFLEN(lcfg, idx) \
+ ((lcfg)->lcfg_bufcount <= (idx) \
+ ? 0 \
+ : (lcfg)->lcfg_buflens[(idx)])
+
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+ __u32 index,
+ void *buf,
+ __u32 buflen)
+{
+ if (index >= LUSTRE_CFG_MAX_BUFCOUNT)
+ return;
+ if (bufs == NULL)
+ return;
+
+ if (bufs->lcfg_bufcount <= index)
+ bufs->lcfg_bufcount = index + 1;
+
+ bufs->lcfg_buf[index] = buf;
+ bufs->lcfg_buflen[index] = buflen;
+}
+
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+ __u32 index,
+ char *str)
+{
+ lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0);
+}
+
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{
+ memset((bufs), 0, sizeof(*bufs));
+ if (name)
+ lustre_cfg_bufs_set_string(bufs, 0, name);
+}
+
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+ int i;
+ int offset;
+ int bufcount;
+ LASSERT (lcfg != NULL);
+ LASSERT (index >= 0);
+
+ bufcount = lcfg->lcfg_bufcount;
+ if (index >= bufcount)
+ return NULL;
+
+ offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+ for (i = 0; i < index; i++)
+ offset += cfs_size_round(lcfg->lcfg_buflens[i]);
+ return (char *)lcfg + offset;
+}
+
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+ struct lustre_cfg *lcfg)
+{
+ int i;
+ bufs->lcfg_bufcount = lcfg->lcfg_bufcount;
+ for (i = 0; i < bufs->lcfg_bufcount; i++) {
+ bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+ bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i);
+ }
+}
+
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
+{
+ char *s;
+
+ if (lcfg->lcfg_buflens[index] == 0)
+ return NULL;
+
+ s = lustre_cfg_buf(lcfg, index);
+ if (s == NULL)
+ return NULL;
+
+ /*
+ * make sure it's NULL terminated, even if this kills a char
+ * of data. Try to use the padding first though.
+ */
+ if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+ int last = min((int)lcfg->lcfg_buflens[index],
+ cfs_size_round(lcfg->lcfg_buflens[index]) - 1);
+ char lost = s[last];
+ s[last] = '\0';
+ if (lost != '\0') {
+ CWARN("Truncated buf %d to '%s' (lost '%c'...)\n",
+ index, s, lost);
+ }
+ }
+ return s;
+}
+
+static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens)
+{
+ int i;
+ int len;
+
+ len = LCFG_HDR_SIZE(bufcount);
+ for (i = 0; i < bufcount; i++)
+ len += cfs_size_round(buflens[i]);
+
+ return cfs_size_round(len);
+}
+
+
+#include "obd_support.h"
+
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+ struct lustre_cfg_bufs *bufs)
+{
+ struct lustre_cfg *lcfg;
+ char *ptr;
+ int i;
+
+ OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+ bufs->lcfg_buflen));
+ if (!lcfg)
+ return ERR_PTR(-ENOMEM);
+
+ lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+ lcfg->lcfg_command = cmd;
+ lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
+
+ ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+ for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+ lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+ LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr);
+ }
+ return lcfg;
+}
+
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+ int len;
+
+ len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+
+ OBD_FREE(lcfg, len);
+ return;
+}
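+
+/* Illustrative usage sketch (not part of the original header): the usual
+ * pattern for building a config record with the helpers above -- buffer 0
+ * holds the device name, further buffers hold string arguments.  "obdname"
+ * and the parameter string are hypothetical:
+ *
+ *	struct lustre_cfg_bufs bufs;
+ *	struct lustre_cfg *lcfg;
+ *
+ *	lustre_cfg_bufs_reset(&bufs, obdname);
+ *	lustre_cfg_bufs_set_string(&bufs, 1, "some.param=value");
+ *	lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+ *	if (IS_ERR(lcfg))
+ *		return PTR_ERR(lcfg);
+ *	... hand lcfg to the config processing code ...
+ *	lustre_cfg_free(lcfg);
+ */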
+
+static inline int lustre_cfg_sanity_check(void *buf, int len)
+{
+ struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
+
+ if (!lcfg)
+ return -EINVAL;
+
+ /* check that the first bits of the struct are valid */
+ if (len < LCFG_HDR_SIZE(0))
+ return -EINVAL;
+
+ if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
+ return -EINVAL;
+
+ if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+ return -EINVAL;
+
+ /* check that the buflens are valid */
+ if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount))
+ return -EINVAL;
+
+ /* make sure all the pointers point inside the data */
+ if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+ return -EINVAL;
+
+ return 0;
+}
+
+#include "lustre/lustre_user.h"
+
+/** @} cfg */
+
+#endif /* _LUSTRE_CFG_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h
new file mode 100644
index 000000000..6c92d0bc9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_debug.h
@@ -0,0 +1,56 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_DEBUG_H
+#define _LUSTRE_DEBUG_H
+
+/** \defgroup debug debug
+ *
+ * @{
+ */
+
+#include "lustre_net.h"
+#include "obd.h"
+
+/* lib/debug.c */
+void dump_lniobuf(struct niobuf_local *lnb);
+int dump_req(struct ptlrpc_request *req);
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id);
+int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id);
+
+/** @} debug */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
new file mode 100644
index 000000000..9b2833131
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -0,0 +1,547 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/types.h"
+#include <linux/backing-dev.h>
+
+/****************** on-disk files *********************/
+
+#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define CONFIGS_FILE "mountdata"
+/** Persistent mount data are stored on the disk in this file. */
+#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
+#define LAST_RCVD "last_rcvd"
+#define LOV_OBJID "lov_objid"
+#define LOV_OBJSEQ "lov_objseq"
+#define HEALTH_CHECK "health_check"
+#define CAPA_KEYS "capa_keys"
+#define CHANGELOG_USERS "changelog_users"
+#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS"
+#define QMT_DIR "quota_master"
+#define QSD_DIR "quota_slave"
+#define HSM_ACTIONS "hsm_actions"
+
+/****************** persistent mount data *********************/
+
+#define LDD_F_SV_TYPE_MDT 0x0001
+#define LDD_F_SV_TYPE_OST 0x0002
+#define LDD_F_SV_TYPE_MGS 0x0004
+#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \
+ LDD_F_SV_TYPE_OST | \
+ LDD_F_SV_TYPE_MGS)
+#define LDD_F_SV_ALL 0x0008
+/** need an index assignment */
+#define LDD_F_NEED_INDEX 0x0010
+/** never registered */
+#define LDD_F_VIRGIN 0x0020
+/** update the config logs for this server */
+#define LDD_F_UPDATE 0x0040
+/** rewrite the LDD */
+#define LDD_F_REWRITE_LDD 0x0080
+/** regenerate config logs for this fs or server */
+#define LDD_F_WRITECONF 0x0100
+/** COMPAT_14 */
+#define LDD_F_UPGRADE14 0x0200
+/** process as lctl conf_param */
+#define LDD_F_PARAM 0x0400
+/** all nodes are specified as service nodes */
+#define LDD_F_NO_PRIMNODE 0x1000
+/** IR enable flag */
+#define LDD_F_IR_CAPABLE 0x2000
+/** the MGS refused to register the target. */
+#define LDD_F_ERROR 0x4000
+/** process at lctl conf_param */
+#define LDD_F_PARAM2 0x8000
+
+/* opc for target register */
+#define LDD_F_OPC_REG 0x10000000
+#define LDD_F_OPC_UNREG 0x20000000
+#define LDD_F_OPC_READY 0x40000000
+#define LDD_F_OPC_MASK 0xf0000000
+
+#define LDD_F_ONDISK_MASK (LDD_F_SV_TYPE_MASK)
+
+#define LDD_F_MASK 0xFFFF
+
+enum ldd_mount_type {
+ LDD_MT_EXT3 = 0,
+ LDD_MT_LDISKFS,
+ LDD_MT_SMFS,
+ LDD_MT_REISERFS,
+ LDD_MT_LDISKFS2,
+ LDD_MT_ZFS,
+ LDD_MT_LAST
+};
+
+static inline char *mt_str(enum ldd_mount_type mt)
+{
+ static char *mount_type_string[] = {
+ "ext3",
+ "ldiskfs",
+ "smfs",
+ "reiserfs",
+ "ldiskfs2",
+ "zfs",
+ };
+ return mount_type_string[mt];
+}
+
+static inline char *mt_type(enum ldd_mount_type mt)
+{
+ static char *mount_type_string[] = {
+ "osd-ldiskfs",
+ "osd-ldiskfs",
+ "osd-smfs",
+ "osd-reiserfs",
+ "osd-ldiskfs",
+ "osd-zfs",
+ };
+ return mount_type_string[mt];
+}
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/* On-disk configuration file. In host-endian order. */
+struct lustre_disk_data {
+ __u32 ldd_magic;
+ __u32 ldd_feature_compat; /* compatible feature flags */
+ __u32 ldd_feature_rocompat;/* read-only compatible feature flags */
+ __u32 ldd_feature_incompat;/* incompatible feature flags */
+
+ __u32 ldd_config_ver; /* config rewrite count - not used */
+ __u32 ldd_flags; /* LDD_SV_TYPE */
+ __u32 ldd_svindex; /* server index (0001), must match
+ svname */
+ __u32 ldd_mount_type; /* target fs type LDD_MT_* */
+ char ldd_fsname[64]; /* filesystem this server is part of,
+ MTI_NAME_MAXLEN */
+ char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/
+ __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */
+
+/*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */
+/*1024*/__u8 ldd_padding[4096 - 1024];
+/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char ldd_params[4096]; /* key=value pairs */
+};
+
+
+#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+ LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST))
+#define MT_STR(data) mt_str((data)->ldd_mount_type)
+
+/* Make the mdt/ost server obd name based on the filesystem name */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+ char *name)
+{
+ if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+ if (!(flags & LDD_F_SV_ALL))
+ sprintf(name, "%.8s%c%s%04x", fs,
+ (flags & LDD_F_VIRGIN) ? ':' :
+ ((flags & LDD_F_WRITECONF) ? '=' : '-'),
+ (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST",
+ index);
+ } else if (flags & LDD_F_SV_TYPE_MGS) {
+ sprintf(name, "MGS");
+ } else {
+ CERROR("unknown server type %#x\n", flags);
+ return 1;
+ }
+ return 0;
+}
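+
+/* Illustrative example (not part of the original header) of the names
+ * produced by server_make_name() for a filesystem called "lustre":
+ *
+ *	char name[64];
+ *
+ *	server_make_name(LDD_F_SV_TYPE_MDT, 0, "lustre", name);
+ *
+ * yields "lustre-MDT0000"; a virgin (never registered) target gets ':'
+ * instead of '-' ("lustre:OST0001"), writeconf gets '=' ("lustre=MDT0000"),
+ * and a standalone MGS is simply named "MGS".
+ */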
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+ everything as string options */
+
+#define LMD_MAGIC 0xbdacbd03
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+ __u32 lmd_magic;
+ __u32 lmd_flags; /* lustre mount flags */
+ int lmd_mgs_failnodes; /* mgs failover node count */
+ int lmd_exclude_count;
+ int lmd_recovery_time_soft;
+ int lmd_recovery_time_hard;
+ char *lmd_dev; /* device name */
+ char *lmd_profile; /* client only */
+ char *lmd_mgssec; /* sptlrpc flavor to mgs */
+ char *lmd_opts; /* lustre mount options (as opposed to
+ _device_ mount options) */
+ char *lmd_params; /* lustre params */
+ __u32 *lmd_exclude; /* array of OSTs to ignore */
+ char *lmd_mgs; /* MGS nid */
+ char *lmd_osd_type; /* OSD type */
+};
+
+#define LMD_FLG_SERVER 0x0001 /* Mounting a server */
+#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */
+#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */
+#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers,
+ no other services */
+#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing
+ existing MGS services */
+#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */
+#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */
+#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */
+#define LMD_FLG_IAM 0x0400 /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */
+#define LMD_FLG_UPDATE 0x2000 /* update parameters */
+#define LMD_FLG_HSM 0x4000 /* Start coordinator */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+
+/****************** last_rcvd file *********************/
+
+/** version recovery epoch */
+#define LR_EPOCH_BITS 32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
+#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */
+
+#define LR_SERVER_SIZE 512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE 128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
+#define LR_MAX_CLIENTS (128 * 1024UL)
+#else
+#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8)
+#endif
+
+/** COMPAT_146: this is an OST (temporary) */
+#define OBD_COMPAT_OST 0x00000002
+/** COMPAT_146: this is an MDT (temporary) */
+#define OBD_COMPAT_MDT 0x00000004
+/** 2.0 server, interop flag to show server version is changed */
+#define OBD_COMPAT_20 0x00000008
+
+/** MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_LOVOBJID 0x00000001
+
+/** OST handles group subdirs */
+#define OBD_INCOMPAT_GROUPS 0x00000001
+/** this is an OST */
+#define OBD_INCOMPAT_OST 0x00000002
+/** this is an MDT */
+#define OBD_INCOMPAT_MDT 0x00000004
+/** common last_rcvd format */
+#define OBD_INCOMPAT_COMMON_LR 0x00000008
+/** FID is enabled */
+#define OBD_INCOMPAT_FID 0x00000010
+/** Size-on-MDS is enabled */
+#define OBD_INCOMPAT_SOM 0x00000020
+/** filesystem using iam format to store directory entries */
+#define OBD_INCOMPAT_IAM_DIR 0x00000040
+/** LMA attribute contains per-inode incompatible flags */
+#define OBD_INCOMPAT_LMA 0x00000080
+/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
+ * bits are now used to store a generation. Once we start changing the layout
+ * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
+ * will be confused by interpreting stripe_count | gen << 16 as the actual
+ * stripe count */
+#define OBD_INCOMPAT_LMM_VER 0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI 0x00000200
+
+/* Data stored per server at the head of the last_rcvd file. In le32 order.
+ This should be common to filter_internal.h, lustre_mds.h */
+struct lr_server_data {
+ __u8 lsd_uuid[40]; /* server UUID */
+ __u64 lsd_last_transno; /* last completed transaction ID */
+ __u64 lsd_compat14; /* reserved - compat with old last_rcvd */
+ __u64 lsd_mount_count; /* incarnation number */
+ __u32 lsd_feature_compat; /* compatible feature flags */
+ __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+ __u32 lsd_feature_incompat;/* incompatible feature flags */
+ __u32 lsd_server_size; /* size of server data area */
+ __u32 lsd_client_start; /* start of per-client data area */
+ __u16 lsd_client_size; /* size of per-client data area */
+ __u16 lsd_subdir_count; /* number of subdirectories for objects */
+ __u64 lsd_catalog_oid; /* recovery catalog object id */
+ __u32 lsd_catalog_ogen; /* recovery catalog inode generation */
+ __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */
+ __u32 lsd_osd_index; /* index number of OST in LOV */
+ __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */
+ __u32 lsd_start_epoch; /* VBR: start epoch from last boot */
+ /** transaction values since lsd_trans_table_time */
+ __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+ /** start point of transno table below */
+ __u32 lsd_trans_table_time; /* time of first slot in table above */
+ __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+ __u8 lsd_padding[LR_SERVER_SIZE - 288];
+};
+
+/* Data stored per client in the last_rcvd file. In le32 order. */
+struct lsd_client_data {
+ __u8 lcd_uuid[40]; /* client UUID */
+ __u64 lcd_last_transno; /* last completed transaction ID */
+ __u64 lcd_last_xid; /* xid for the last transaction */
+ __u32 lcd_last_result; /* result from last RPC */
+ __u32 lcd_last_data; /* per-op data (disposition for open &c.) */
+ /* for MDS_CLOSE requests */
+ __u64 lcd_last_close_transno; /* last completed transaction ID */
+ __u64 lcd_last_close_xid; /* xid for the last transaction */
+ __u32 lcd_last_close_result; /* result from last RPC */
+ __u32 lcd_last_close_data; /* per-op data */
+ /* VBR: last versions */
+ __u64 lcd_pre_versions[4];
+ __u32 lcd_last_epoch;
+	/** orphans handling for delayed export relies on this */
+ __u32 lcd_first_epoch;
+ __u8 lcd_padding[LR_CLIENT_SIZE - 128];
+};
+
+/* bug20354: the lcd_uuid for export of clients may be wrong */
+static inline void check_lcd(char *obd_name, int index,
+ struct lsd_client_data *lcd)
+{
+ int length = sizeof(lcd->lcd_uuid);
+ if (strnlen((char*)lcd->lcd_uuid, length) == length) {
+ lcd->lcd_uuid[length - 1] = '\0';
+
+ LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n",
+ lcd->lcd_uuid, obd_name, index);
+ }
+}
+
+/* last_rcvd handling */
+static inline void lsd_le_to_cpu(struct lr_server_data *buf,
+ struct lr_server_data *lsd)
+{
+ int i;
+ memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
+ lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno);
+ lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14);
+ lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count);
+ lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat);
+ lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
+ lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
+ lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size);
+ lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start);
+ lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size);
+ lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count);
+ lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid);
+ lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen);
+ memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
+ lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index);
+ lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1);
+ lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch);
+ for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+ lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
+ lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
+ lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
+}
+
+static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
+ struct lr_server_data *buf)
+{
+ int i;
+ memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
+ buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno);
+ buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14);
+ buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count);
+ buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat);
+ buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
+ buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
+ buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size);
+ buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start);
+ buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size);
+ buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count);
+ buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid);
+ buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen);
+ memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
+ buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index);
+ buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1);
+ buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch);
+ for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+ buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
+ buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
+ buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
+}
+
+static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
+ struct lsd_client_data *lcd)
+{
+ memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid));
+ lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno);
+ lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid);
+ lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result);
+ lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data);
+ lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
+ lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid);
+ lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result);
+ lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data);
+ lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]);
+ lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]);
+ lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]);
+ lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]);
+ lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch);
+ lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch);
+}
+
+static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
+ struct lsd_client_data *buf)
+{
+ memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid));
+ buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno);
+ buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid);
+ buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result);
+ buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data);
+ buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
+ buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid);
+ buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result);
+ buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data);
+ buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]);
+ buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]);
+ buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]);
+ buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]);
+ buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch);
+ buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch);
+}
+
+static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
+{
+ return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ?
+ lcd->lcd_last_transno : lcd->lcd_last_close_transno);
+}
+
+static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
+{
+ return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ?
+ lcd->lcd_last_xid : lcd->lcd_last_close_xid);
+}
+
+/****************** superblock additional info *********************/
+
+struct ll_sb_info;
+
+struct lustre_sb_info {
+ int lsi_flags;
+ struct obd_device *lsi_mgc; /* mgc obd */
+ struct lustre_mount_data *lsi_lmd; /* mount command info */
+ struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */
+ struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/
+ struct vfsmount *lsi_srv_mnt; /* the one server mount */
+ atomic_t lsi_mounts; /* references to the srv_mnt */
+ char lsi_svname[MTI_NAME_MAXLEN];
+ char lsi_osd_obdname[64];
+ char lsi_osd_uuid[64];
+ struct obd_export *lsi_osd_exp;
+ char lsi_osd_type[16];
+ char lsi_fstype[16];
+	struct backing_dev_info lsi_bdi;     /* each client mountpoint needs
+						its own backing_dev_info */
+};
+
+#define LSI_UMOUNT_FAILOVER 0x00200000
+#define LSI_BDI_INITIALIZED 0x00400000
+
+#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info))
+#define s2lsi_nocast(sb) ((sb)->s_fs_info)
+
+#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile)
+#define get_mount_flags(sb) (s2lsi(sb)->lsi_lmd->lmd_flags)
+#define get_mntdev_name(sb) (s2lsi(sb)->lsi_lmd->lmd_dev)
+
+
+/****************** mount lookup info *********************/
+
+struct lustre_mount_info {
+ char *lmi_name;
+ struct super_block *lmi_sb;
+ struct vfsmount *lmi_mnt;
+ struct list_head lmi_list_chain;
+};
+
+/****************** prototypes *********************/
+
+/* obd_mount.c */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr);
+int server_name2index(const char *svname, __u32 *idx, const char **endptr);
+int server_name2svname(const char *label, char *svname, const char **endptr,
+ size_t svsize);
+
+int lustre_put_lsi(struct super_block *sb);
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+ char *s1, char *s2, char *s3, char *s4);
+int lustre_start_mgc(struct super_block *sb);
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+ struct vfsmount *mnt));
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
+
+/** @} disk */
+
+#endif /* _LUSTRE_DISK_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h
new file mode 100644
index 000000000..bac9902b5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h
@@ -0,0 +1,1480 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** \defgroup LDLM Lustre Distributed Lock Manager
+ *
+ * Lustre DLM is based on VAX DLM.
+ * Its two main roles are:
+ * - To provide locking assuring consistency of data on all Lustre nodes.
+ * - To allow clients to cache state protected by a lock by holding the
+ * lock until a conflicting lock is requested or it is expired by the LRU.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include "lustre_lib.h"
+#include "lustre_net.h"
+#include "lustre_import.h"
+#include "lustre_handles.h"
+#include "interval_tree.h" /* for interval_node{}, ldlm_extent */
+#include "lu_ref.h"
+
+#include "lustre_dlm_flags.h"
+
+struct obd_ops;
+struct obd_device;
+
+#define OBD_LDLM_DEVICENAME "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
+#define LDLM_CTIME_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+
+/**
+ * LDLM non-error return states
+ */
+typedef enum {
+ ELDLM_OK = 0,
+
+ ELDLM_LOCK_CHANGED = 300,
+ ELDLM_LOCK_ABORTED = 301,
+ ELDLM_LOCK_REPLACED = 302,
+ ELDLM_NO_LOCK_DATA = 303,
+ ELDLM_LOCK_WOULDBLOCK = 304,
+
+ ELDLM_NAMESPACE_EXISTS = 400,
+ ELDLM_BAD_NAMESPACE = 401
+} ldlm_error_t;
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+typedef enum {
+ LDLM_NAMESPACE_SERVER = 1 << 0,
+ LDLM_NAMESPACE_CLIENT = 1 << 1
+} ldlm_side_t;
+
+/**
+ * The blocking callback is overloaded to perform two functions. These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING 1
+#define LDLM_CB_CANCELING 2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, the MDS requests an EX
+ * lock on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ * lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ * an OST, a lock with PR mode will be issued. Also, if the client opens a
+ * file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ * requests a write lock during a file open operation.
+ * - CR: Concurrent Read mode. When a client performs a path lookup, the MDS
+ * grants an inodebits lock with the CR mode on the intermediate path component.
+ * - NL: Null mode.
+ *
+ * <PRE>
+ * NL CR CW PR PW EX
+ * NL 1 1 1 1 1 1
+ * CR 1 1 1 1 1 0
+ * CW 1 1 1 0 0 0
+ * PR 1 1 0 1 0 0
+ * PW 1 1 0 0 0 0
+ * EX 1 0 0 0 0 0
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX LCK_NL
+#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern ldlm_mode_t lck_compat_array[];
+
+static inline void lockmode_verify(ldlm_mode_t mode)
+{
+ LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
+}
+
+static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
+{
+ return (lck_compat_array[exist_mode] & new_mode);
+}
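+
+/*
+ * Editorial usage sketch (not part of the original header): per the
+ * compatibility matrix above, an existing PR lock is compatible with a new
+ * PR request but conflicts with a new PW request.
+ */
+static inline void lockmode_compat_example(void)
+{
+	LASSERT(lockmode_compat(LCK_PR, LCK_PR));  /* matrix entry 1 */
+	LASSERT(!lockmode_compat(LCK_PR, LCK_PW)); /* matrix entry 0 */
+}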
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+ - do we just separate this by security domains and use a prefix for
+ multiple namespaces in the same domain?
+ -
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ * waiting_locks_spinlock
+ *
+ * lr_lock
+ * led_lock
+ *
+ * lr_lock
+ * ns_lock
+ *
+ * lr_lvb_mutex
+ * lr_lock
+ *
+ */
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * An LDLM pool is a pool of locks in a namespace without any implicitly
+ * specified limits.
+ * Locks in the pool are organized in an LRU list.
+ * Local memory pressure or server instructions (e.g. memory pressure on the
+ * server) can trigger freeing of locks from the pool.
+ */
+struct ldlm_pool_ops {
+ /** Recalculate pool \a pl usage */
+ int (*po_recalc)(struct ldlm_pool *pl);
+ /** Cancel at least \a nr locks from pool \a pl */
+ int (*po_shrink)(struct ldlm_pool *pl, int nr,
+ gfp_t gfp_mask);
+ int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
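+
+/*
+ * Editorial sketch (not part of the original header): a hypothetical no-op
+ * wiring of ldlm_pool_ops, only to illustrate the callback roles. The real
+ * client/server implementations live in ldlm_pool.c.
+ */
+static inline int example_po_recalc(struct ldlm_pool *pl)
+{
+	return 0;	/* would recompute the SLV / grant plan for \a pl */
+}
+
+static inline int example_po_shrink(struct ldlm_pool *pl, int nr,
+				    gfp_t gfp_mask)
+{
+	return 0;	/* would cancel roughly \a nr locks from \a pl */
+}
+
+static const struct ldlm_pool_ops example_pool_ops = {
+	.po_recalc = example_po_recalc,
+	.po_shrink = example_po_shrink,
+};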
+
+/** One second for the pools thread check interval. Each pool has its own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * It is used to decide when to release locks, e.g. under memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+ /** Pool proc directory. */
+ struct proc_dir_entry *pl_proc_dir;
+ /** Pool name, must be long enough to hold compound proc entry name. */
+ char pl_name[100];
+ /** Lock for protecting SLV/CLV updates. */
+ spinlock_t pl_lock;
+	/** Number of allowed locks in the pool, both client and server side. */
+ atomic_t pl_limit;
+	/** Number of granted locks in the pool. */
+ atomic_t pl_granted;
+ /** Grant rate per T. */
+ atomic_t pl_grant_rate;
+ /** Cancel rate per T. */
+ atomic_t pl_cancel_rate;
+ /** Server lock volume (SLV). Protected by pl_lock. */
+ __u64 pl_server_lock_volume;
+ /** Current biggest client lock volume. Protected by pl_lock. */
+ __u64 pl_client_lock_volume;
+	/** Lock volume factor. The SLV on a client is calculated as follows:
+ * server_slv * lock_volume_factor. */
+ atomic_t pl_lock_volume_factor;
+ /** Time when last SLV from server was obtained. */
+ time_t pl_recalc_time;
+ /** Recalculation period for pool. */
+ time_t pl_recalc_period;
+ /** Recalculation and shrink operations. */
+ const struct ldlm_pool_ops *pl_ops;
+ /** Number of planned locks for next period. */
+ int pl_grant_plan;
+ /** Pool statistics. */
+ struct lprocfs_stats *pl_stats;
+};
+
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
+ void *req_cookie, ldlm_mode_t mode, __u64 flags,
+ void *data);
+
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
+/**
+ * LVB operations.
+ * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could
+ * be associated with an LDLM lock and transferred from client to server and
+ * back.
+ *
+ * Currently LVBs are used by:
+ * - OSC-OST code to maintain current object size/times
+ * - layout lock code to return the layout when the layout lock is granted
+ */
+struct ldlm_valblock_ops {
+ int (*lvbo_init)(struct ldlm_resource *res);
+ int (*lvbo_update)(struct ldlm_resource *res,
+ struct ptlrpc_request *r,
+ int increase);
+ int (*lvbo_free)(struct ldlm_resource *res);
+	/* Return the size of the LVB data so that an appropriately sized RPC
+	 * buffer can be reserved */
+ int (*lvbo_size)(struct ldlm_lock *lock);
+ /* Called to fill in lvb data to RPC buffer @buf */
+ int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen);
+};
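+
+/*
+ * Editorial sketch (not part of the original header): a hypothetical LVB
+ * carrying a single __u64, to show how lvbo_size() and lvbo_fill() cooperate:
+ * the reply buffer is reserved with lvbo_size() and packed with lvbo_fill().
+ */
+static inline int example_lvbo_size(struct ldlm_lock *lock)
+{
+	return sizeof(__u64);
+}
+
+static inline int example_lvbo_fill(struct ldlm_lock *lock, void *buf,
+				    int buflen)
+{
+	__u64 value = 0;	/* would come from the protected object */
+
+	if (buflen < (int)sizeof(value))
+		return -ERANGE;
+	*(__u64 *)buf = value;
+	return sizeof(value);
+}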
+
+/**
+ * LDLM pools related: the type of lock pool in the namespace.
+ * Greedy means cached locks are released aggressively.
+ */
+typedef enum {
+ LDLM_NAMESPACE_GREEDY = 1 << 0,
+ LDLM_NAMESPACE_MODEST = 1 << 1
+} ldlm_appetite_t;
+
+/**
+ * Default values for the "max_nolock_size", "contention_time" and
+ * "contended_locks" namespace tunables.
+ */
+#define NS_DEFAULT_MAX_NOLOCK_BYTES 0
+#define NS_DEFAULT_CONTENTION_SECONDS 2
+#define NS_DEFAULT_CONTENDED_LOCKS 32
+
+struct ldlm_ns_bucket {
+ /** back pointer to namespace */
+ struct ldlm_namespace *nsb_namespace;
+ /**
+ * Estimated lock callback time. Used by adaptive timeout code to
+ * avoid spurious client evictions due to unresponsiveness when in
+ * fact the network or overall system load is at fault
+ */
+ struct adaptive_timeout nsb_at_estimate;
+};
+
+enum {
+ /** LDLM namespace lock stats */
+ LDLM_NSS_LOCKS = 0,
+ LDLM_NSS_LAST
+};
+
+typedef enum {
+ /** invalid type */
+ LDLM_NS_TYPE_UNKNOWN = 0,
+ /** mdc namespace */
+ LDLM_NS_TYPE_MDC,
+ /** mds namespace */
+ LDLM_NS_TYPE_MDT,
+ /** osc namespace */
+ LDLM_NS_TYPE_OSC,
+ /** ost namespace */
+ LDLM_NS_TYPE_OST,
+ /** mgc namespace */
+ LDLM_NS_TYPE_MGC,
+ /** mgs namespace */
+ LDLM_NS_TYPE_MGT,
+} ldlm_ns_type_t;
+
+/**
+ * LDLM Namespace.
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ * to make decisions like what locks could be granted and what conflicts
+ * exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ * only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by a client in that namespace is actually represented
+ * by two in-memory locks: one on the server and one on the client. The locks
+ * are linked by a special cookie by which one node can tell the other which
+ * lock it actually means during communication. Such locks are called remote
+ * locks. Locks held only by the server, without any reference to a client,
+ * are called local locks.
+ */
+struct ldlm_namespace {
+ /** Backward link to OBD, required for LDLM pool to store new SLV. */
+ struct obd_device *ns_obd;
+
+ /** Flag indicating if namespace is on client instead of server */
+ ldlm_side_t ns_client;
+
+ /** Resource hash table for namespace. */
+ struct cfs_hash *ns_rs_hash;
+
+ /** serialize */
+ spinlock_t ns_lock;
+
+ /** big refcount (by bucket) */
+ atomic_t ns_bref;
+
+ /**
+ * Namespace connect flags supported by server (may be changed via
+ * /proc, LRU resize may be disabled/enabled).
+ */
+ __u64 ns_connect_flags;
+
+ /** Client side original connect flags supported by server. */
+ __u64 ns_orig_connect_flags;
+
+ /* namespace proc dir entry */
+ struct proc_dir_entry *ns_proc_dir_entry;
+
+ /**
+ * Position in global namespace list linking all namespaces on
+ * the node.
+ */
+ struct list_head ns_list_chain;
+
+ /**
+ * List of unused locks for this namespace. This list is also called
+ * LRU lock list.
+ * Unused locks are locks with zero reader/writer reference counts.
+ * This list is only used on clients for lock caching purposes.
+ * When we want to release some locks voluntarily or if server wants
+ * us to release some locks due to e.g. memory pressure, we take locks
+ * to release from the head of this list.
+ * Locks are linked via l_lru field in \see struct ldlm_lock.
+ */
+ struct list_head ns_unused_list;
+ /** Number of locks in the LRU list above */
+ int ns_nr_unused;
+
+ /**
+	 * Maximum number of locks permitted in the LRU. If 0, locks are
+	 * managed by pools and there is no preset limit; instead, everything is
+	 * controlled by the available memory on this client and on the server.
+ */
+ unsigned int ns_max_unused;
+ /** Maximum allowed age (last used time) for locks in the LRU */
+ unsigned int ns_max_age;
+ /**
+ * Server only: number of times we evicted clients due to lack of reply
+ * to ASTs.
+ */
+ unsigned int ns_timeouts;
+ /**
+ * Number of seconds since the file change time after which the
+ * MDT will return an UPDATE lock along with a LOOKUP lock.
+ * This allows the client to start caching negative dentries
+ * for a directory and may save an RPC for a later stat.
+ */
+ unsigned int ns_ctime_age_limit;
+
+ /**
+ * Used to rate-limit ldlm_namespace_dump calls.
+ * \see ldlm_namespace_dump. Increased by 10 seconds every time
+ * it is called.
+ */
+ unsigned long ns_next_dump;
+
+ /** "policy" function that does actual lock conflict determination */
+ ldlm_res_policy ns_policy;
+
+ /**
+ * LVB operations for this namespace.
+ * \see struct ldlm_valblock_ops
+ */
+ struct ldlm_valblock_ops *ns_lvbo;
+
+ /**
+ * Used by filter code to store pointer to OBD of the service.
+ * Should be dropped in favor of \a ns_obd
+ */
+ void *ns_lvbp;
+
+ /**
+ * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+ * a resource is removed.
+ */
+ wait_queue_head_t ns_waitq;
+ /** LDLM pool structure for this namespace */
+ struct ldlm_pool ns_pool;
+ /** Definition of how eagerly unused locks will be released from LRU */
+ ldlm_appetite_t ns_appetite;
+
+ /**
+ * If more than \a ns_contended_locks are found, the resource is
+ * considered to be contended. Lock enqueues might specify that no
+ * contended locks should be granted
+ */
+ unsigned ns_contended_locks;
+
+ /**
+ * The resources in this namespace remember contended state during
+ * \a ns_contention_time, in seconds.
+ */
+ unsigned ns_contention_time;
+
+ /**
+ * Limit size of contended extent locks, in bytes.
+	 * If an extent lock is requested for more than this many bytes and the
+	 * caller instructs us not to grant contended locks, we disregard
+	 * such a request.
+ */
+ unsigned ns_max_nolock_size;
+
+ /** Limit of parallel AST RPC count. */
+ unsigned ns_max_parallel_ast;
+
+	/** Callback to cancel locks before replaying them during recovery. */
+ ldlm_cancel_for_recovery ns_cancel_for_recovery;
+
+ /** LDLM lock stats */
+ struct lprocfs_stats *ns_stats;
+
+ /**
+ * Flag to indicate namespace is being freed. Used to determine if
+ * recalculation of LDLM pool statistics should be skipped.
+ */
+ unsigned ns_stopping:1;
+};
+
+/**
+ * Returns 1 if namespace \a ns is a client namespace.
+ */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+ LDLM_NAMESPACE_SERVER)));
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+ ns->ns_client == LDLM_NAMESPACE_SERVER);
+ return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if namespace \a ns is a server namespace.
+ */
+static inline int ns_is_server(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+ LDLM_NAMESPACE_SERVER)));
+ LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+ ns->ns_client == LDLM_NAMESPACE_SERVER);
+ return ns->ns_client == LDLM_NAMESPACE_SERVER;
+}
+
+/**
+ * Returns 1 if namespace \a ns supports early lock cancel (ELC).
+ */
+static inline int ns_connect_cancelset(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET);
+}
+
+/**
+ * Returns 1 if this namespace supports lru_resize.
+ */
+static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
+{
+ LASSERT(ns != NULL);
+ return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+ ldlm_cancel_for_recovery arg)
+{
+ LASSERT(ns != NULL);
+ ns->ns_cancel_for_recovery = arg;
+}
+
+struct ldlm_lock;
+
+/** Type for blocking callback function of a lock. */
+typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *new, void *data,
+ int flag);
+/** Type for completion callback function of a lock. */
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
+ void *data);
+/** Type for glimpse callback function of a lock. */
+typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
+
+/** Work list for sending GL ASTs to multiple locks. */
+struct ldlm_glimpse_work {
+ struct ldlm_lock *gl_lock; /* lock to glimpse */
+ struct list_head gl_list; /* linkage to other gl work structs */
+ __u32 gl_flags;/* see LDLM_GL_WORK_* below */
+ union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in
+ * glimpse callback request */
+};
+
+/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
+#define LDLM_GL_WORK_NOFREE 0x1
+
+/** Interval node data for each LDLM_EXTENT lock. */
+struct ldlm_interval {
+ struct interval_node li_node; /* node for tree management */
+ struct list_head li_group; /* the locks which have the same
+ * policy - group of the policy */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/**
+ * Interval tree for extent locks.
+ * The interval tree must be accessed under the resource lock.
+ * Interval trees are used for granted extent locks to speed up conflicts
+ * lookup. See ldlm/interval_tree.c for more details.
+ */
+struct ldlm_interval_tree {
+ /** Tree size. */
+ int lit_size;
+ ldlm_mode_t lit_mode; /* lock mode */
+ struct interval_node *lit_root; /* actual ldlm_interval */
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+typedef enum {
+ LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */
+	LCF_LOCAL      = 0x2, /* Cancel locks locally, not notifying server */
+ LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
+ * in the same RPC */
+} ldlm_cancel_flags_t;
+
+struct ldlm_flock {
+ __u64 start;
+ __u64 end;
+ __u64 owner;
+ __u64 blocking_owner;
+ struct obd_export *blocking_export;
+ /* Protected by the hash lock */
+ __u32 blocking_refs;
+ __u32 pid;
+};
+
+typedef union {
+ struct ldlm_extent l_extent;
+ struct ldlm_flock l_flock;
+ struct ldlm_inodebits l_inodebits;
+} ldlm_policy_data_t;
+
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+ const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+ const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+
+enum lvb_type {
+ LVB_T_NONE = 0,
+ LVB_T_OST = 1,
+ LVB_T_LQUOTA = 2,
+ LVB_T_LAYOUT = 3,
+};
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+ /**
+ * Local lock handle.
+	 * When the remote side wants to tell us about a lock, it addresses
+ * it by this opaque handle. The handle does not hold a
+ * reference on the ldlm_lock, so it can be safely passed to
+ * other threads or nodes. When the lock needs to be accessed
+ * from the handle, it is looked up again in the lock table, and
+ * may no longer exist.
+ *
+ * Must be first in the structure.
+ */
+ struct portals_handle l_handle;
+ /**
+ * Lock reference count.
+ * This is how many users have pointers to actual structure, so that
+ * we do not accidentally free lock structure that is in use.
+ */
+ atomic_t l_refc;
+ /**
+ * Internal spinlock protects l_resource. We should hold this lock
+ * first before taking res_lock.
+ */
+ spinlock_t l_lock;
+ /**
+ * Pointer to actual resource this lock is in.
+ * ldlm_lock_change_resource() can change this.
+ */
+ struct ldlm_resource *l_resource;
+ /**
+ * List item for client side LRU list.
+ * Protected by ns_lock in struct ldlm_namespace.
+ */
+ struct list_head l_lru;
+ /**
+ * Linkage to resource's lock queues according to current lock state.
+ * (could be granted, waiting or converting)
+ * Protected by lr_lock in struct ldlm_resource.
+ */
+ struct list_head l_res_link;
+ /**
+ * Tree node for ldlm_extent.
+ */
+ struct ldlm_interval *l_tree_node;
+ /**
+ * Per export hash of locks.
+ * Protected by per-bucket exp->exp_lock_hash locks.
+ */
+ struct hlist_node l_exp_hash;
+ /**
+ * Per export hash of flock locks.
+ * Protected by per-bucket exp->exp_flock_hash locks.
+ */
+ struct hlist_node l_exp_flock_hash;
+ /**
+ * Requested mode.
+ * Protected by lr_lock.
+ */
+ ldlm_mode_t l_req_mode;
+ /**
+ * Granted mode, also protected by lr_lock.
+ */
+ ldlm_mode_t l_granted_mode;
+ /** Lock completion handler pointer. Called when lock is granted. */
+ ldlm_completion_callback l_completion_ast;
+ /**
+ * Lock blocking AST handler pointer.
+ * It plays two roles:
+ * - as a notification of an attempt to queue a conflicting lock (once)
+ * - as a notification when the lock is being cancelled.
+ *
+ * As such it's typically called twice: once for the initial conflict
+ * and then once more when the last user went away and the lock is
+ * cancelled (could happen recursively).
+ */
+ ldlm_blocking_callback l_blocking_ast;
+ /**
+ * Lock glimpse handler.
+	 * The glimpse handler is used by the server to obtain LVB updates
+	 * from a client.
+ */
+ ldlm_glimpse_callback l_glimpse_ast;
+
+ /**
+ * Lock export.
+ * This is a pointer to actual client export for locks that were granted
+ * to clients. Used server-side.
+ */
+ struct obd_export *l_export;
+ /**
+ * Lock connection export.
+ * Pointer to server export on a client.
+ */
+ struct obd_export *l_conn_export;
+
+ /**
+ * Remote lock handle.
+	 * If the lock is remote, this is the handle of the other side's lock
+ * (l_handle)
+ */
+ struct lustre_handle l_remote_handle;
+
+ /**
+ * Representation of private data specific for a lock type.
+ * Examples are: extent range for extent lock or bitmask for ibits locks
+ */
+ ldlm_policy_data_t l_policy_data;
+
+ /**
+ * Lock state flags. Protected by lr_lock.
+ * \see lustre_dlm_flags.h where the bits are defined.
+ */
+ __u64 l_flags;
+
+ /**
+ * Lock r/w usage counters.
+ * Protected by lr_lock.
+ */
+ __u32 l_readers;
+ __u32 l_writers;
+ /**
+ * If the lock is granted, a process sleeps on this waitq to learn when
+ * it's no longer in use. If the lock is not granted, a process sleeps
+ * on this waitq to learn when it becomes granted.
+ */
+ wait_queue_head_t l_waitq;
+
+ /**
+ * Seconds. It will be updated if there is any activity related to
+	 * the lock, e.g. enqueueing the lock or sending a blocking AST.
+ */
+ unsigned long l_last_activity;
+
+ /**
+	 * Time the lock was last used, e.g. last matched by ldlm_lock_match().
+	 * In jiffies; convert to wall-clock time if needed.
+ */
+ unsigned long l_last_used;
+
+ /** Originally requested extent for the extent lock. */
+ struct ldlm_extent l_req_extent;
+
+ /*
+ * Client-side-only members.
+ */
+
+ enum lvb_type l_lvb_type;
+
+ /**
+ * Temporary storage for a LVB received during an enqueue operation.
+ */
+ __u32 l_lvb_len;
+ void *l_lvb_data;
+
+ /** Private storage for lock user. Opaque to LDLM. */
+ void *l_ast_data;
+
+ /*
+ * Server-side-only members.
+ */
+
+ /**
+ * Connection cookie for the client originating the operation.
+ * Used by Commit on Share (COS) code. Currently only used for
+ * inodebits locks on MDS.
+ */
+ __u64 l_client_cookie;
+
+ /**
+ * List item for locks waiting for cancellation from clients.
+ * The lists this could be linked into are:
+ * waiting_locks_list (protected by waiting_locks_spinlock),
+ * then if the lock timed out, it is moved to
+ * expired_lock_thread.elt_expired_locks for further processing.
+ * Protected by elt_lock.
+ */
+ struct list_head l_pending_chain;
+
+ /**
+ * Set when lock is sent a blocking AST. Time in seconds when timeout
+ * is reached and client holding this lock could be evicted.
+ * This timeout could be further extended by e.g. certain IO activity
+ * under this lock.
+ * \see ost_rw_prolong_locks
+ */
+ unsigned long l_callback_timeout;
+
+ /** Local PID of process which created this lock. */
+ __u32 l_pid;
+
+ /**
+ * Number of times blocking AST was sent for this lock.
+	 * This is for debugging. Valid values are 0 and 1; if there is an
+	 * attempt to send a blocking AST more than once, an assertion will be
+ * hit. \see ldlm_work_bl_ast_lock
+ */
+ int l_bl_ast_run;
+ /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+ struct list_head l_bl_ast;
+ /** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+ struct list_head l_cp_ast;
+ /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+ struct list_head l_rk_ast;
+
+ /**
+ * Pointer to a conflicting lock that caused blocking AST to be sent
+ * for this lock
+ */
+ struct ldlm_lock *l_blocking_lock;
+
+ /**
+ * Protected by lr_lock, linkages to "skip lists".
+ * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+ */
+ struct list_head l_sl_mode;
+ struct list_head l_sl_policy;
+
+ /** Reference tracking structure to debug leaked locks. */
+ struct lu_ref l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ /* Debugging stuff for bug 20498, for tracking export references. */
+ /** number of export references taken */
+ int l_exp_refs_nr;
+ /** link all locks referencing one export */
+ struct list_head l_exp_refs_link;
+ /** referenced export object */
+ struct obd_export *l_exp_refs_target;
+#endif
+ /**
+ * export blocking dlm lock list, protected by
+ * l_export->exp_bl_list_lock.
+	 * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+	 * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+ */
+ struct list_head l_exp_list;
+};
+
+/**
+ * LDLM resource description.
+ * Basically, resource is a representation for a single object.
+ * An object has a name, which is currently four 64-bit integers. The LDLM
+ * user is responsible for creating a mapping between the objects it wants
+ * to be protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ */
+struct ldlm_resource {
+ struct ldlm_ns_bucket *lr_ns_bucket;
+
+ /**
+ * List item for list in namespace hash.
+ * protected by ns_lock
+ */
+ struct hlist_node lr_hash;
+
+ /** Spinlock to protect locks under this resource. */
+ spinlock_t lr_lock;
+
+ /**
+ * protected by lr_lock
+ * @{ */
+ /** List of locks in granted state */
+ struct list_head lr_granted;
+ /** List of locks waiting to change their granted mode (converted) */
+ struct list_head lr_converting;
+ /**
+ * List of locks that could not be granted due to conflicts and
+ * that are waiting for conflicts to go away */
+ struct list_head lr_waiting;
+ /** @} */
+
+ /* XXX No longer needed? Remove ASAP */
+ ldlm_mode_t lr_most_restr;
+
+ /** Type of locks this resource can hold. Only one type per resource. */
+ ldlm_type_t lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */
+
+ /** Resource name */
+ struct ldlm_res_id lr_name;
+ /** Reference count for this resource */
+ atomic_t lr_refcount;
+
+ /**
+ * Interval trees (only for extent locks) for all modes of this resource
+ */
+ struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];
+
+ /**
+ * Server-side-only lock value block elements.
+ * To serialize lvbo_init.
+ */
+ struct mutex lr_lvb_mutex;
+ int lr_lvb_len;
+ /** protected by lr_lock */
+ void *lr_lvb_data;
+
+ /** When the resource was considered as contended. */
+ unsigned long lr_contention_time;
+ /** List of references to this resource. For debugging. */
+ struct lu_ref lr_reference;
+
+ struct inode *lr_lvb_inode;
+};
+
+static inline bool ldlm_has_layout(struct ldlm_lock *lock)
+{
+ return lock->l_resource->lr_type == LDLM_IBITS &&
+ lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
+}
+
+static inline char *
+ldlm_ns_name(struct ldlm_namespace *ns)
+{
+ return ns->ns_rs_hash->hs_name;
+}
+
+static inline struct ldlm_namespace *
+ldlm_res_to_ns(struct ldlm_resource *res)
+{
+ return res->lr_ns_bucket->nsb_namespace;
+}
+
+static inline struct ldlm_namespace *
+ldlm_lock_to_ns(struct ldlm_lock *lock)
+{
+ return ldlm_res_to_ns(lock->l_resource);
+}
+
+static inline char *
+ldlm_lock_to_ns_name(struct ldlm_lock *lock)
+{
+ return ldlm_ns_name(ldlm_lock_to_ns(lock));
+}
+
+static inline struct adaptive_timeout *
+ldlm_lock_to_ns_at(struct ldlm_lock *lock)
+{
+ return &lock->l_resource->lr_ns_bucket->nsb_at_estimate;
+}
+
+static inline int ldlm_lvbo_init(struct ldlm_resource *res)
+{
+ struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+ if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL)
+ return ns->ns_lvbo->lvbo_init(res);
+
+ return 0;
+}
+
+static inline int ldlm_lvbo_size(struct ldlm_lock *lock)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL)
+ return ns->ns_lvbo->lvbo_size(lock);
+
+ return 0;
+}
+
+static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ if (ns->ns_lvbo != NULL) {
+ LASSERT(ns->ns_lvbo->lvbo_fill != NULL);
+ return ns->ns_lvbo->lvbo_fill(lock, buf, len);
+ }
+ return 0;
+}
+
+struct ldlm_ast_work {
+ struct ldlm_lock *w_lock;
+ int w_blocking;
+ struct ldlm_lock_desc w_desc;
+ struct list_head w_list;
+ int w_flags;
+ void *w_data;
+ int w_datalen;
+};
+
+/**
+ * Common ldlm_enqueue parameters
+ */
+struct ldlm_enqueue_info {
+ __u32 ei_type; /** Type of the lock being enqueued. */
+ __u32 ei_mode; /** Mode of the lock being enqueued. */
+ void *ei_cb_bl; /** blocking lock callback */
+ void *ei_cb_cp; /** lock completion callback */
+ void *ei_cb_gl; /** lock glimpse callback */
+ void *ei_cbdata; /** Data to be passed into callbacks. */
+};
+
+extern struct obd_ops ldlm_obd_ops;
+
+extern char *ldlm_lockname[];
+extern char *ldlm_typename[];
+extern char *ldlm_it2str(int it);
+
+/**
+ * Just a fancy CDEBUG call with the log level preset to D_DLMTRACE.
+ * For cases where we do not have an actual lock to print along
+ * with an LDLM-related debugging message.
+ */
+#define LDLM_DEBUG_NOLOCK(format, a...) \
+ CDEBUG(D_DLMTRACE, "### " format "\n" , ##a)
+
+/**
+ * Support function for lock information printing into debug logs.
+ * \see LDLM_DEBUG
+ */
+#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _ldlm_lock_debug(lock, msgdata, fmt, ##a); \
+} while (0)
+
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+ struct libcfs_debug_msg_data *data,
+ const char *fmt, ...)
+ __printf(3, 4);
+
+/**
+ * Rate-limited version of lock printing function.
+ */
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \
+ static struct cfs_debug_limit_state _ldlm_cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \
+ ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\
+} while (0)
+
+#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
+/** Non-rate-limited lock printing function for debugging purposes. */
+#define LDLM_DEBUG(lock, fmt, a...) do { \
+ if (likely(lock != NULL)) { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \
+ ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \
+ "### " fmt , ##a); \
+ } else { \
+ LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \
+ } \
+} while (0)
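+
+/*
+ * Editorial usage sketch (not part of the original header): typical
+ * invocations of the lock-debugging macros above.
+ */
+static inline void example_ldlm_debug_usage(struct ldlm_lock *lock, int rc)
+{
+	LDLM_DEBUG(lock, "processing lock: rc = %d", rc);
+	if (rc != 0)
+		LDLM_ERROR(lock, "operation failed: rc = %d", rc);
+	LDLM_DEBUG_NOLOCK("no lock at hand: rc = %d", rc);
+}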
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+ int first_enq, ldlm_error_t *err,
+ struct list_head *work_list);
+
+/**
+ * Return values for lock iterators.
+ * Also used during deciding of lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP 2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides for a way to iterate through every lock on a resource or
+ * namespace or every resource in a namespace.
+ * @{ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+ void *closure);
+void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
+ void *closure);
+int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *,
+ ldlm_iterator_t iter, void *data);
+/** @} ldlm_iterator */
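+
+/*
+ * Editorial sketch (not part of the original header): a hypothetical iterator
+ * callback counting the locks on a resource via ldlm_resource_foreach().
+ */
+static inline int example_count_lock_cb(struct ldlm_lock *lock, void *closure)
+{
+	int *count = closure;
+
+	(*count)++;
+	return LDLM_ITER_CONTINUE;	/* LDLM_ITER_STOP would end the walk */
+}
+
+static inline int example_count_locks(struct ldlm_resource *res)
+{
+	int count = 0;
+
+	ldlm_resource_foreach(res, example_count_lock_cb, &count);
+	return count;
+}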
+
+int ldlm_replay_locks(struct obd_import *imp);
+
+/* ldlm_flock.c */
+int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+
+/* ldlm_extent.c */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
+
+struct ldlm_callback_suite {
+ ldlm_completion_callback lcs_completion;
+ ldlm_blocking_callback lcs_blocking;
+ ldlm_glimpse_callback lcs_glimpse;
+};
+
+/* ldlm_lockd.c */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_get_ref(void);
+void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
+struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req);
+
+/* ldlm_lock.c */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg);
+void ldlm_lock2handle(const struct ldlm_lock *lock,
+ struct lustre_handle *lockh);
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags);
+void ldlm_cancel_callback(struct ldlm_lock *);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *);
+int ldlm_lock_set_data(struct lustre_handle *, void *);
+
+/**
+ * Obtain a lock reference by its handle.
+ */
+static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
+{
+ return __ldlm_handle2lock(h, 0);
+}
+
+#define LDLM_LOCK_REF_DEL(lock) \
+ lu_ref_del(&lock->l_reference, "handle", current)
+
+static inline struct ldlm_lock *
+ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags)
+{
+ struct ldlm_lock *lock;
+
+ lock = __ldlm_handle2lock(h, flags);
+ if (lock != NULL)
+ LDLM_LOCK_REF_DEL(lock);
+ return lock;
+}
+
+/**
+ * Update Lock Value Block Operations (LVBO) on a resource taking into account
+ * data from request \a r
+ */
+static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+ struct ptlrpc_request *r, int increase)
+{
+ if (ldlm_res_to_ns(res)->ns_lvbo &&
+ ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
+ return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r,
+ increase);
+ }
+ return 0;
+}
+
+int ldlm_error2errno(ldlm_error_t error);
+ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this
+ * confuses user-space. */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp);
+#endif
+
+/**
+ * Release a temporary lock reference obtained by ldlm_handle2lock() or
+ * __ldlm_handle2lock().
+ */
+#define LDLM_LOCK_PUT(lock) \
+do { \
+ LDLM_LOCK_REF_DEL(lock); \
+ /*LDLM_DEBUG((lock), "put");*/ \
+ ldlm_lock_put(lock); \
+} while (0)
+
+/**
+ * Release a lock reference obtained by some other means (see
+ * LDLM_LOCK_PUT()).
+ */
+#define LDLM_LOCK_RELEASE(lock) \
+do { \
+ /*LDLM_DEBUG((lock), "put");*/ \
+ ldlm_lock_put(lock); \
+} while (0)
+
+#define LDLM_LOCK_GET(lock) \
+({ \
+ ldlm_lock_get(lock); \
+ /*LDLM_DEBUG((lock), "get");*/ \
+ lock; \
+})
+
+#define ldlm_lock_list_put(head, member, count) \
+({ \
+ struct ldlm_lock *_lock, *_next; \
+ int c = count; \
+ list_for_each_entry_safe(_lock, _next, head, member) { \
+ if (c-- == 0) \
+ break; \
+ list_del_init(&_lock->member); \
+ LDLM_LOCK_RELEASE(_lock); \
+ } \
+ LASSERT(c <= 0); \
+})
+
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+void ldlm_lock_put(struct ldlm_lock *lock);
+void ldlm_lock_destroy(struct ldlm_lock *lock);
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc);
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode);
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock);
+void ldlm_lock_fail_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+ const struct ldlm_res_id *, ldlm_type_t type,
+ ldlm_policy_data_t *, ldlm_mode_t mode,
+ struct lustre_handle *, int unref);
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+ __u64 *bits);
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+ __u32 *flags);
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
+void ldlm_lock_cancel(struct ldlm_lock *lock);
+void ldlm_reprocess_all(struct ldlm_resource *res);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req);
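+
+/*
+ * Editorial usage sketch (not part of the original header): the common
+ * handle-to-lock round trip. ldlm_handle2lock() may return NULL if the lock
+ * no longer exists; the reference it takes is dropped with LDLM_LOCK_PUT().
+ */
+static inline int example_lock_from_handle(const struct lustre_handle *lockh)
+{
+	struct ldlm_lock *lock;
+
+	lock = ldlm_handle2lock(lockh);
+	if (lock == NULL)
+		return -ENOENT;	/* stale or unknown handle */
+
+	LDLM_DEBUG(lock, "found lock by handle");
+	LDLM_LOCK_PUT(lock);
+	return 0;
+}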
+
+/* resource.c */
+struct ldlm_namespace *
+ldlm_namespace_new(struct obd_device *obd, char *name,
+ ldlm_side_t client, ldlm_appetite_t apt,
+ ldlm_ns_type_t ns_type);
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags);
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+ struct obd_import *imp, int force);
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_get(struct ldlm_namespace *ns);
+void ldlm_namespace_put(struct ldlm_namespace *ns);
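+
+/*
+ * Editorial sketch (not part of the original header): a hypothetical
+ * client-side namespace lifecycle built on the interfaces above. The
+ * namespace name and the appetite/type choices are illustrative only.
+ */
+static inline struct ldlm_namespace *example_ns_create(struct obd_device *obd)
+{
+	return ldlm_namespace_new(obd, "example-client-ns",
+				  LDLM_NAMESPACE_CLIENT,
+				  LDLM_NAMESPACE_GREEDY,
+				  LDLM_NS_TYPE_MDC);
+}
+
+static inline void example_ns_destroy(struct ldlm_namespace *ns,
+				      struct obd_import *imp)
+{
+	ldlm_namespace_free(ns, imp, 0 /* !force */);
+}
+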
+#if defined (CONFIG_PROC_FS)
+int ldlm_proc_setup(void);
+void ldlm_proc_cleanup(void);
+#else
+static inline int ldlm_proc_setup(void) { return 0; }
+static inline void ldlm_proc_cleanup(void) {}
+#endif
+
+/* resource.c - internal */
+struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
+ struct ldlm_resource *parent,
+ const struct ldlm_res_id *,
+ ldlm_type_t type, int create);
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
+int ldlm_resource_putref(struct ldlm_resource *res);
+void ldlm_resource_add_lock(struct ldlm_resource *res,
+ struct list_head *head,
+ struct ldlm_lock *lock);
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+ const struct ldlm_res_id *);
+
+#define LDLM_RESOURCE_ADDREF(res) do { \
+ lu_ref_add_atomic(&(res)->lr_reference, __func__, current); \
+} while (0)
+
+#define LDLM_RESOURCE_DELREF(res) do { \
+ lu_ref_del(&(res)->lr_reference, __func__, current); \
+} while (0)
+
+/* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
+/** \defgroup ldlm_local_ast Default AST handlers for local locks
+ * These AST handlers are typically used for server-side local locks and are
+ * also used by client-side lock handlers to perform minimal base
+ * processing.
+ * @{ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock);
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag);
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data);
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+/** @} ldlm_local_ast */
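+
+/*
+ * Editorial sketch (not part of the original header): filling in struct
+ * ldlm_enqueue_info (defined earlier) with the default handlers above, the
+ * way a client-side caller of ldlm_cli_enqueue() might. The type and mode
+ * values are illustrative assumptions.
+ */
+static inline void example_init_einfo(struct ldlm_enqueue_info *einfo,
+				      void *cb_data)
+{
+	einfo->ei_type   = LDLM_IBITS;
+	einfo->ei_mode   = LCK_PR;
+	einfo->ei_cb_bl  = ldlm_blocking_ast;
+	einfo->ei_cb_cp  = ldlm_completion_ast;
+	einfo->ei_cb_gl  = NULL;
+	einfo->ei_cbdata = cb_data;
+}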
+
+/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users.
+ * These are typically used by client and server (*_local versions)
+ * to obtain and release locks.
+ * @{ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+ struct ldlm_enqueue_info *einfo,
+ const struct ldlm_res_id *res_id,
+ ldlm_policy_data_t const *policy, __u64 *flags,
+ void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+ struct lustre_handle *lockh, int async);
+int ldlm_prep_enqueue_req(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ struct list_head *cancels,
+ int count);
+int ldlm_prep_elc_req(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ int version, int opc, int canceloff,
+ struct list_head *cancels, int count);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len);
+int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req,
+ const struct ldlm_request *dlm_req,
+ const struct ldlm_callback_suite *cbs);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+ ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+ __u64 *flags, void *lvb, __u32 lvb_len,
+ struct lustre_handle *lockh, int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_type_t type, ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, __u64 *flags,
+ ldlm_blocking_callback blocking,
+ ldlm_completion_callback completion,
+ ldlm_glimpse_callback glimpse,
+ void *data, __u32 lvb_len, enum lvb_type lvb_type,
+ const __u64 *client_cookie,
+ struct lustre_handle *lockh);
+int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
+ void *data, __u32 data_len);
+int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags);
+int ldlm_cli_update_pool(struct ptlrpc_request *req);
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+ ldlm_cancel_flags_t cancel_flags);
+int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *,
+ ldlm_cancel_flags_t flags, void *opaque);
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags,
+ void *opaque);
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head,
+ int count, ldlm_cancel_flags_t flags);
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+ struct list_head *cancels,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, __u64 lock_flags,
+ ldlm_cancel_flags_t cancel_flags, void *opaque);
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+ ldlm_cancel_flags_t flags);
+int ldlm_cli_cancel_list(struct list_head *head, int count,
+ struct ptlrpc_request *req, ldlm_cancel_flags_t flags);
+/** @} ldlm_cli_api */
+
+/* mds/handler.c */
+/* This has to be here because recursive inclusion sucks. */
+int intent_disposition(struct ldlm_reply *rep, int flag);
+void intent_set_disposition(struct ldlm_reply *rep, int flag);
+
+
+/* ioctls for trying requests */
+#define IOC_LDLM_TYPE 'f'
+#define IOC_LDLM_MIN_NR 40
+
+#define IOC_LDLM_TEST _IOWR('f', 40, long)
+#define IOC_LDLM_DUMP _IOWR('f', 41, long)
+#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long)
+#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long)
+#define IOC_LDLM_MAX_NR 43
+
+/**
+ * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more
+ * than one lock_res is dead-lock safe.
+ */
+enum lock_res_type {
+ LRT_NORMAL,
+ LRT_NEW
+};
+
+/** Lock resource. */
+static inline void lock_res(struct ldlm_resource *res)
+{
+ spin_lock(&res->lr_lock);
+}
+
+/** Lock resource, instructing lockdep that this nested locking is safe. */
+static inline void lock_res_nested(struct ldlm_resource *res,
+ enum lock_res_type mode)
+{
+ spin_lock_nested(&res->lr_lock, mode);
+}
+
+/** Unlock resource. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+ spin_unlock(&res->lr_lock);
+}
+
+/** Check if resource is already locked, assert if not. */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+ assert_spin_locked(&res->lr_lock);
+}
+
+struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
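+
+/*
+ * Editorial usage sketch (not part of the original header): the expected
+ * pattern around the resource spinlock helpers above.
+ */
+static inline void example_res_lock_usage(struct ldlm_resource *res)
+{
+	lock_res(res);
+	check_res_locked(res);	/* asserts that lr_lock is held */
+	/* ... inspect lr_granted / lr_waiting here ... */
+	unlock_res(res);
+}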
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of LDLM.
+ * @{
+ */
+int ldlm_pools_recalc(ldlm_side_t client);
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+ int idx, ldlm_side_t client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+ gfp_t gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+int ldlm_pool_recalc(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
+
+#endif
+/** @} LDLM */
diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
new file mode 100644
index 000000000..16dcdbfae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h
@@ -0,0 +1,476 @@
+/* -*- buffer-read-only: t -*- vi: set ro:
+ *
+ * DO NOT EDIT THIS FILE (lustre_dlm_flags.h)
+ *
+ * It has been AutoGen-ed
+ * From the definitions lustre_dlm_flags.def
+ * and the template file lustre_dlm_flags.tpl
+ *
+ * lustre is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lustre is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/**
+ * \file lustre_dlm_flags.h
+ * The flags and collections of flags (masks) for \see struct ldlm_lock.
+ * This file is derived from flag definitions in lustre_dlm_flags.def.
+ * The format is defined in the lustre_dlm_flags.tpl template file.
+ *
+ * \addtogroup LDLM Lustre Distributed Lock Manager
+ * @{
+ *
+ * \name flags
+ * The flags and collections of flags (masks) for \see struct ldlm_lock.
+ * @{
+ */
+#ifndef LDLM_ALL_FLAGS_MASK
+
+/** l_flags bits marked as "all_flags" bits */
+#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL
+
+/** l_flags bits marked as "ast" bits */
+#define LDLM_FL_AST_MASK 0x0000000080008000ULL
+
+/** l_flags bits marked as "blocked" bits */
+#define LDLM_FL_BLOCKED_MASK 0x000000000000000EULL
+
+/** l_flags bits marked as "gone" bits */
+#define LDLM_FL_GONE_MASK 0x0006004000000000ULL
+
+/** l_flags bits marked as "hide_lock" bits */
+#define LDLM_FL_HIDE_LOCK_MASK 0x0000206400000000ULL
+
+/** l_flags bits marked as "inherit" bits */
+#define LDLM_FL_INHERIT_MASK 0x0000000000800000ULL
+
+/** l_flags bits marked as "local_only" bits */
+#define LDLM_FL_LOCAL_ONLY_MASK 0x00FFFFFF00000000ULL
+
+/** l_flags bits marked as "on_wire" bits */
+#define LDLM_FL_ON_WIRE_MASK 0x00000000C08F932FULL
+
+/** extent, mode, or resource changed */
+#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL // bit 0
+#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 0)
+#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG(( _l), 1ULL << 0)
+#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0)
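+
+/*
+ * Editorial usage note (not part of the original header): every flag below
+ * generates three helpers that wrap the LDLM_TEST_FLAG / LDLM_SET_FLAG /
+ * LDLM_CLEAR_FLAG macros (defined elsewhere in this header) around
+ * lock->l_flags, e.g. for the flag above:
+ *
+ *	if (!ldlm_is_lock_changed(lock))
+ *		ldlm_set_lock_changed(lock);
+ */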
+
+/**
+ * Server placed lock on granted list, or a recovering client wants the
+ * lock added to the granted list, no questions asked. */
+#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL // bit 1
+#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG(( _l), 1ULL << 1)
+#define ldlm_set_block_granted(_l) LDLM_SET_FLAG(( _l), 1ULL << 1)
+#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1)
+
+/**
+ * Server placed lock on conv list, or a recovering client wants the lock
+ * added to the conv list, no questions asked. */
+#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL // bit 2
+#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 2)
+#define ldlm_set_block_conv(_l) LDLM_SET_FLAG(( _l), 1ULL << 2)
+#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2)
+
+/**
+ * Server placed lock on wait list, or a recovering client wants the lock
+ * added to the wait list, no questions asked. */
+#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL // bit 3
+#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 3)
+#define ldlm_set_block_wait(_l) LDLM_SET_FLAG(( _l), 1ULL << 3)
+#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3)
+
+/** blocking or cancel packet was queued for sending. */
+#define LDLM_FL_AST_SENT 0x0000000000000020ULL // bit 5
+#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 5)
+#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG(( _l), 1ULL << 5)
+#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5)
+
+/**
+ * Lock is being replayed. This could probably be implied by the fact that
+ * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. */
+#define LDLM_FL_REPLAY 0x0000000000000100ULL // bit 8
+#define ldlm_is_replay(_l) LDLM_TEST_FLAG(( _l), 1ULL << 8)
+#define ldlm_set_replay(_l) LDLM_SET_FLAG(( _l), 1ULL << 8)
+#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8)
+
+/** Don't grant lock, just do intent. */
+#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL // bit 9
+#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 9)
+#define ldlm_set_intent_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 9)
+#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9)
+
+/** lock request has intent */
+#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL // bit 12
+#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG(( _l), 1ULL << 12)
+#define ldlm_set_has_intent(_l) LDLM_SET_FLAG(( _l), 1ULL << 12)
+#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12)
+
+/** flock deadlock detected */
+#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL /* bit 15 */
+#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG((_l), 1ULL << 15)
+#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG((_l), 1ULL << 15)
+#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15)
+
+/** discard (no writeback) on cancel */
+#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL // bit 16
+#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 16)
+#define ldlm_set_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 16)
+#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16)
+
+/** Blocked by group lock - wait indefinitely */
+#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL // bit 17
+#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG(( _l), 1ULL << 17)
+#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG(( _l), 1ULL << 17)
+#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17)
+
+/**
+ * Server told not to wait if blocked. For AGL, OST will not send glimpse
+ * callback. */
+#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL // bit 18
+#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG(( _l), 1ULL << 18)
+#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG(( _l), 1ULL << 18)
+#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18)
+
+/** return blocking lock */
+#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL // bit 19
+#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 19)
+#define ldlm_set_test_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 19)
+#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19)
+
+/**
+ * Immediately cancel such locks when they block some other locks. Send
+ * cancel notification to original lock holder, but expect no reply. This
+ * is for clients (like liblustre) that cannot be expected to reliably
+ * respond to a blocking AST.
+#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL // bit 23
+#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG(( _l), 1ULL << 23)
+#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG(( _l), 1ULL << 23)
+#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23)
+
+/**
+ * measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL // bit 30
+#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG(( _l), 1ULL << 30)
+#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG(( _l), 1ULL << 30)
+#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30)
+
+/**
+ * These are flags that are mapped into the flags and ASTs of blocking
+ * locks. Add FL_DISCARD to blocking ASTs. */
+#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL // bit 31
+#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG(( _l), 1ULL << 31)
+#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG(( _l), 1ULL << 31)
+#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31)
+
+/**
+ * Used for marking lock as a target for -EINTR while cp_ast sleep emulation
+ * + race with upcoming bl_ast. */
+#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL // bit 32
+#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 32)
+#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG(( _l), 1ULL << 32)
+#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32)
+
+/**
+ * Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it. */
+#define LDLM_FL_SKIPPED 0x0000000200000000ULL // bit 33
+#define ldlm_is_skipped(_l) LDLM_TEST_FLAG(( _l), 1ULL << 33)
+#define ldlm_set_skipped(_l) LDLM_SET_FLAG(( _l), 1ULL << 33)
+#define ldlm_clear_skipped(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 33)
+
+/** this lock is being destroyed */
+#define LDLM_FL_CBPENDING 0x0000000400000000ULL // bit 34
+#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG(( _l), 1ULL << 34)
+#define ldlm_set_cbpending(_l) LDLM_SET_FLAG(( _l), 1ULL << 34)
+#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34)
+
+/** not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL // bit 35
+#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG(( _l), 1ULL << 35)
+#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG(( _l), 1ULL << 35)
+#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35)
+
+/** cancellation callback already run */
+#define LDLM_FL_CANCEL 0x0000001000000000ULL // bit 36
+#define ldlm_is_cancel(_l) LDLM_TEST_FLAG(( _l), 1ULL << 36)
+#define ldlm_set_cancel(_l) LDLM_SET_FLAG(( _l), 1ULL << 36)
+#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36)
+
+/** whatever it might mean */
+#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL // bit 37
+#define ldlm_is_local_only(_l) LDLM_TEST_FLAG(( _l), 1ULL << 37)
+#define ldlm_set_local_only(_l) LDLM_SET_FLAG(( _l), 1ULL << 37)
+#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37)
+
+/** don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_FAILED 0x0000004000000000ULL // bit 38
+#define ldlm_is_failed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 38)
+#define ldlm_set_failed(_l) LDLM_SET_FLAG(( _l), 1ULL << 38)
+#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38)
+
+/** lock cancel has already been sent */
+#define LDLM_FL_CANCELING 0x0000008000000000ULL // bit 39
+#define ldlm_is_canceling(_l) LDLM_TEST_FLAG(( _l), 1ULL << 39)
+#define ldlm_set_canceling(_l) LDLM_SET_FLAG(( _l), 1ULL << 39)
+#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39)
+
+/** local lock (ie, no srv/cli split) */
+#define LDLM_FL_LOCAL 0x0000010000000000ULL // bit 40
+#define ldlm_is_local(_l) LDLM_TEST_FLAG(( _l), 1ULL << 40)
+#define ldlm_set_local(_l) LDLM_SET_FLAG(( _l), 1ULL << 40)
+#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40)
+
+/**
+ * XXX FIXME: This is being added to b_size as a low-risk fix to the
+ * fact that the LVB filling happens _after_ the lock has been granted,
+ * so another thread can match it before the LVB has been updated. As a
+ * dirty hack, we set LDLM_FL_LVB_READY only after we've done the LVB poop.
+ * this is only needed on LOV/OSC now, where LVB is actually used and
+ * callers must set it in input flags.
+ *
+ * The proper fix is to do the granting inside of the completion AST,
+ * which can be replaced with a LVB-aware wrapping function for OSC locks.
+ * That change is pretty high-risk, though, and would need a lot more
+ * testing. */
+#define LDLM_FL_LVB_READY 0x0000020000000000ULL // bit 41
+#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG(( _l), 1ULL << 41)
+#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG(( _l), 1ULL << 41)
+#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41)
+
+/**
+ * A lock contributes to the known minimum size (KMS) calculation until it
+ * has finished the part of its cancellation that performs write back on its
+ * dirty pages. It can remain on the granted list during this whole time.
+ * Threads racing to update the KMS after performing their writeback need
+ * to know to exclude each other's locks from the calculation as they walk
+ * the granted list. */
+#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL // bit 42
+#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG(( _l), 1ULL << 42)
+#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG(( _l), 1ULL << 42)
+#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42)
+
+/** completion AST to be executed */
+#define LDLM_FL_CP_REQD 0x0000080000000000ULL // bit 43
+#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG(( _l), 1ULL << 43)
+#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG(( _l), 1ULL << 43)
+#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43)
+
+/** cleanup_resource has already handled the lock */
+#define LDLM_FL_CLEANED 0x0000100000000000ULL // bit 44
+#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG(( _l), 1ULL << 44)
+#define ldlm_set_cleaned(_l) LDLM_SET_FLAG(( _l), 1ULL << 44)
+#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44)
+
+/**
+ * Optimization hint: LDLM can run the blocking callback from the current
+ * context without involving a separate thread, in order to decrease the
+ * context-switch rate. */
+#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL // bit 45
+#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG(( _l), 1ULL << 45)
+#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG(( _l), 1ULL << 45)
+#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45)
+
+/**
+ * It may happen that a client initiates two operations, e.g. unlink and
+ * mkdir, such that the server sends a blocking AST for conflicting locks
+ * to this client for the first operation, whereas the second operation
+ * has canceled this lock and is waiting for rpc_lock which is taken by
+ * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in
+ * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it.
+ *
+ * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock cache is
+ * dropped to let ldlm_callback_handler() return EINVAL to the server. It
+ * is used when ELC RPC is already prepared and is waiting for rpc_lock,
+ * too late to send a separate CANCEL RPC. */
+#define LDLM_FL_BL_AST 0x0000400000000000ULL // bit 46
+#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG(( _l), 1ULL << 46)
+#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG(( _l), 1ULL << 46)
+#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46)
+
+/** see the LDLM_FL_BL_AST comment above: set by ldlm_cancel_callback() when
+ * the lock cache is dropped */
+#define LDLM_FL_BL_DONE 0x0000800000000000ULL // bit 47
+#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG(( _l), 1ULL << 47)
+#define ldlm_set_bl_done(_l) LDLM_SET_FLAG(( _l), 1ULL << 47)
+#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47)
+
+/**
+ * Don't put the lock into the LRU list, so that it is not canceled due
+ * to aging. Used by MGC locks, which are cancelled only at unmount or
+ * by callback. */
+#define LDLM_FL_NO_LRU 0x0001000000000000ULL // bit 48
+#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG(( _l), 1ULL << 48)
+#define ldlm_set_no_lru(_l) LDLM_SET_FLAG(( _l), 1ULL << 48)
+#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48)
+
+/**
+ * Set for locks that failed and where the server has been notified.
+ *
+ * Protected by lock and resource locks. */
+#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL // bit 49
+#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG(( _l), 1ULL << 49)
+#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG(( _l), 1ULL << 49)
+#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49)
+
+/**
+ * Set for locks that were removed from class hash table and will
+ * be destroyed when last reference to them is released. Set by
+ * ldlm_lock_destroy_internal().
+ *
+ * Protected by lock and resource locks. */
+#define LDLM_FL_DESTROYED 0x0004000000000000ULL // bit 50
+#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG(( _l), 1ULL << 50)
+#define ldlm_set_destroyed(_l) LDLM_SET_FLAG(( _l), 1ULL << 50)
+#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50)
+
+/** flag whether this is a server namespace lock */
+#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL // bit 51
+#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG(( _l), 1ULL << 51)
+#define ldlm_set_server_lock(_l) LDLM_SET_FLAG(( _l), 1ULL << 51)
+#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51)
+
+/**
+ * It's set in lock_res_and_lock() and unset in unlock_res_and_lock().
+ *
+ * NB: compared with check_res_locked(), checking this bit is cheaper.
+ * Also, spin_is_locked() is deprecated for kernel code; one reason is
+ * that it works only on SMP, so users need to add extra macros like
+ * LASSERT_SPIN_LOCKED for uniprocessor kernels. */
+#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL // bit 52
+#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG(( _l), 1ULL << 52)
+#define ldlm_set_res_locked(_l) LDLM_SET_FLAG(( _l), 1ULL << 52)
+#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52)
+
+/**
+ * It's set once we call ldlm_add_waiting_lock_res_locked() to start the
+ * lock-timeout timer and it will never be reset.
+ *
+ * Protected by lock and resource locks. */
+#define LDLM_FL_WAITED 0x0020000000000000ULL // bit 53
+#define ldlm_is_waited(_l) LDLM_TEST_FLAG(( _l), 1ULL << 53)
+#define ldlm_set_waited(_l) LDLM_SET_FLAG(( _l), 1ULL << 53)
+#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53)
+
+/** Flag whether this is a server namespace lock. */
+#define LDLM_FL_NS_SRV 0x0040000000000000ULL // bit 54
+#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG(( _l), 1ULL << 54)
+#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG(( _l), 1ULL << 54)
+#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54)
+
+/** Flag whether this lock can be reused. Used by exclusive open. */
+#define LDLM_FL_EXCL 0x0080000000000000ULL /* bit 55 */
+#define ldlm_is_excl(_l) LDLM_TEST_FLAG((_l), 1ULL << 55)
+#define ldlm_set_excl(_l) LDLM_SET_FLAG((_l), 1ULL << 55)
+#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55)
+
+/** test for ldlm_lock flag bit set */
+#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0)
+
+/** set a ldlm_lock flag bit */
+#define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b))
+
+/** clear a ldlm_lock flag bit */
+#define LDLM_CLEAR_FLAG(_l, _b) ((_l)->l_flags &= ~(_b))
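+
+/*
+ * For illustration, the per-flag accessors above expand to these macros;
+ * e.g. for bit 41:
+ *
+ *	if (!ldlm_is_lvb_ready(lock))    <- LDLM_TEST_FLAG(lock, 1ULL << 41)
+ *		wait_for_lvb(lock);      <- wait_for_lvb() is a hypothetical helper
+ */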
+
+/** Mask of flags inherited from parent lock when doing intents. */
+#define LDLM_INHERIT_FLAGS LDLM_FL_INHERIT_MASK
+
+/** Mask of Flags sent in AST lock_flags to map into the receiving lock. */
+#define LDLM_AST_FLAGS LDLM_FL_AST_MASK
+
+/** @} subgroup */
+/** @} group */
+#ifdef WIRESHARK_COMPILE
+static int hf_lustre_ldlm_fl_lock_changed = -1;
+static int hf_lustre_ldlm_fl_block_granted = -1;
+static int hf_lustre_ldlm_fl_block_conv = -1;
+static int hf_lustre_ldlm_fl_block_wait = -1;
+static int hf_lustre_ldlm_fl_ast_sent = -1;
+static int hf_lustre_ldlm_fl_replay = -1;
+static int hf_lustre_ldlm_fl_intent_only = -1;
+static int hf_lustre_ldlm_fl_has_intent = -1;
+static int hf_lustre_ldlm_fl_flock_deadlock = -1;
+static int hf_lustre_ldlm_fl_discard_data = -1;
+static int hf_lustre_ldlm_fl_no_timeout = -1;
+static int hf_lustre_ldlm_fl_block_nowait = -1;
+static int hf_lustre_ldlm_fl_test_lock = -1;
+static int hf_lustre_ldlm_fl_cancel_on_block = -1;
+static int hf_lustre_ldlm_fl_deny_on_contention = -1;
+static int hf_lustre_ldlm_fl_ast_discard_data = -1;
+static int hf_lustre_ldlm_fl_fail_loc = -1;
+static int hf_lustre_ldlm_fl_skipped = -1;
+static int hf_lustre_ldlm_fl_cbpending = -1;
+static int hf_lustre_ldlm_fl_wait_noreproc = -1;
+static int hf_lustre_ldlm_fl_cancel = -1;
+static int hf_lustre_ldlm_fl_local_only = -1;
+static int hf_lustre_ldlm_fl_failed = -1;
+static int hf_lustre_ldlm_fl_canceling = -1;
+static int hf_lustre_ldlm_fl_local = -1;
+static int hf_lustre_ldlm_fl_lvb_ready = -1;
+static int hf_lustre_ldlm_fl_kms_ignore = -1;
+static int hf_lustre_ldlm_fl_cp_reqd = -1;
+static int hf_lustre_ldlm_fl_cleaned = -1;
+static int hf_lustre_ldlm_fl_atomic_cb = -1;
+static int hf_lustre_ldlm_fl_bl_ast = -1;
+static int hf_lustre_ldlm_fl_bl_done = -1;
+static int hf_lustre_ldlm_fl_no_lru = -1;
+static int hf_lustre_ldlm_fl_fail_notified = -1;
+static int hf_lustre_ldlm_fl_destroyed = -1;
+static int hf_lustre_ldlm_fl_server_lock = -1;
+static int hf_lustre_ldlm_fl_res_locked = -1;
+static int hf_lustre_ldlm_fl_waited = -1;
+static int hf_lustre_ldlm_fl_ns_srv = -1;
+static int hf_lustre_ldlm_fl_excl = -1;
+
+const value_string lustre_ldlm_flags_vals[] = {
+ {LDLM_FL_LOCK_CHANGED, "LDLM_FL_LOCK_CHANGED"},
+ {LDLM_FL_BLOCK_GRANTED, "LDLM_FL_BLOCK_GRANTED"},
+ {LDLM_FL_BLOCK_CONV, "LDLM_FL_BLOCK_CONV"},
+ {LDLM_FL_BLOCK_WAIT, "LDLM_FL_BLOCK_WAIT"},
+ {LDLM_FL_AST_SENT, "LDLM_FL_AST_SENT"},
+ {LDLM_FL_REPLAY, "LDLM_FL_REPLAY"},
+ {LDLM_FL_INTENT_ONLY, "LDLM_FL_INTENT_ONLY"},
+ {LDLM_FL_HAS_INTENT, "LDLM_FL_HAS_INTENT"},
+ {LDLM_FL_FLOCK_DEADLOCK, "LDLM_FL_FLOCK_DEADLOCK"},
+ {LDLM_FL_DISCARD_DATA, "LDLM_FL_DISCARD_DATA"},
+ {LDLM_FL_NO_TIMEOUT, "LDLM_FL_NO_TIMEOUT"},
+ {LDLM_FL_BLOCK_NOWAIT, "LDLM_FL_BLOCK_NOWAIT"},
+ {LDLM_FL_TEST_LOCK, "LDLM_FL_TEST_LOCK"},
+ {LDLM_FL_CANCEL_ON_BLOCK, "LDLM_FL_CANCEL_ON_BLOCK"},
+ {LDLM_FL_DENY_ON_CONTENTION, "LDLM_FL_DENY_ON_CONTENTION"},
+ {LDLM_FL_AST_DISCARD_DATA, "LDLM_FL_AST_DISCARD_DATA"},
+ {LDLM_FL_FAIL_LOC, "LDLM_FL_FAIL_LOC"},
+ {LDLM_FL_SKIPPED, "LDLM_FL_SKIPPED"},
+ {LDLM_FL_CBPENDING, "LDLM_FL_CBPENDING"},
+ {LDLM_FL_WAIT_NOREPROC, "LDLM_FL_WAIT_NOREPROC"},
+ {LDLM_FL_CANCEL, "LDLM_FL_CANCEL"},
+ {LDLM_FL_LOCAL_ONLY, "LDLM_FL_LOCAL_ONLY"},
+ {LDLM_FL_FAILED, "LDLM_FL_FAILED"},
+ {LDLM_FL_CANCELING, "LDLM_FL_CANCELING"},
+ {LDLM_FL_LOCAL, "LDLM_FL_LOCAL"},
+ {LDLM_FL_LVB_READY, "LDLM_FL_LVB_READY"},
+ {LDLM_FL_KMS_IGNORE, "LDLM_FL_KMS_IGNORE"},
+ {LDLM_FL_CP_REQD, "LDLM_FL_CP_REQD"},
+ {LDLM_FL_CLEANED, "LDLM_FL_CLEANED"},
+ {LDLM_FL_ATOMIC_CB, "LDLM_FL_ATOMIC_CB"},
+ {LDLM_FL_BL_AST, "LDLM_FL_BL_AST"},
+ {LDLM_FL_BL_DONE, "LDLM_FL_BL_DONE"},
+ {LDLM_FL_NO_LRU, "LDLM_FL_NO_LRU"},
+ {LDLM_FL_FAIL_NOTIFIED, "LDLM_FL_FAIL_NOTIFIED"},
+ {LDLM_FL_DESTROYED, "LDLM_FL_DESTROYED"},
+ {LDLM_FL_SERVER_LOCK, "LDLM_FL_SERVER_LOCK"},
+ {LDLM_FL_RES_LOCKED, "LDLM_FL_RES_LOCKED"},
+ {LDLM_FL_WAITED, "LDLM_FL_WAITED"},
+ {LDLM_FL_NS_SRV, "LDLM_FL_NS_SRV"},
+ {LDLM_FL_EXCL, "LDLM_FL_EXCL"},
+ { 0, NULL }
+};
+#endif /* WIRESHARK_COMPILE */
+#endif /* LDLM_ALL_FLAGS_MASK */
diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h
new file mode 100644
index 000000000..0f8f76c43
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_eacl.h
+ *
+ * Extended ACL (eacl) data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_EACL_H
+#define _LUSTRE_EACL_H
+
+/** \defgroup eacl eacl
+ *
+ * @{
+ */
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <linux/posix_acl_xattr.h>
+
+typedef struct {
+ __u16 e_tag;
+ __u16 e_perm;
+ __u32 e_id;
+ __u32 e_stat;
+} ext_acl_xattr_entry;
+
+typedef struct {
+ __u32 a_count;
+ ext_acl_xattr_entry a_entries[0];
+} ext_acl_xattr_header;
+
+#define CFS_ACL_XATTR_SIZE(count, prefix) \
+ (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry))
+
+#define CFS_ACL_XATTR_COUNT(size, prefix) \
+ (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry))
+
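+/*
+ * For illustration: with the typedefs above, the byte size of an extended
+ * ACL xattr holding "count" entries would be computed as
+ *
+ *	size = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+ *
+ * and CFS_ACL_XATTR_COUNT(size, ext_acl_xattr) recovers the entry count
+ * from a buffer size.
+ */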
+
+extern ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size);
+extern int
+lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, size_t size,
+ posix_acl_xattr_header **out);
+extern void
+lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size);
+extern void
+lustre_ext_acl_xattr_free(ext_acl_xattr_header *header);
+extern int
+lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header,
+ posix_acl_xattr_header **out);
+extern ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header);
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+/** @} eacl */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h
new file mode 100644
index 000000000..9c06a49f1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_export.h
@@ -0,0 +1,406 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_export PortalRPC export definitions
+ *
+ * @{
+ */
+
+#ifndef __EXPORT_H
+#define __EXPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include "lprocfs_status.h"
+#include "lustre/lustre_idl.h"
+#include "lustre_dlm.h"
+
+struct mds_client_data;
+struct mdt_client_data;
+struct mds_idmap_table;
+struct mdt_idmap_table;
+
+/**
+ * Target-specific export data
+ */
+struct tg_export_data {
+ /** Protects led_lcd below */
+ struct mutex ted_lcd_lock;
+ /** Per-client data for each export */
+ struct lsd_client_data *ted_lcd;
+ /** Offset of record in last_rcvd file */
+ loff_t ted_lr_off;
+ /** Client index in last_rcvd file */
+ int ted_lr_idx;
+};
+
+/**
+ * MDT-specific export data
+ */
+struct mdt_export_data {
+ struct tg_export_data med_ted;
+ /** List of all files opened by client on this MDT */
+ struct list_head med_open_head;
+ spinlock_t med_open_lock; /* med_open_head, mfd_list */
+ /** Bitmask of all ibit locks this MDT understands */
+ __u64 med_ibits_known;
+ struct mutex med_idmap_mutex;
+ struct lustre_idmap_table *med_idmap;
+};
+
+struct ec_export_data { /* echo client */
+ struct list_head eced_locks;
+};
+
+/* In-memory access to client data from OST struct */
+/** Filter (OSS-side) specific export data */
+struct filter_export_data {
+ struct tg_export_data fed_ted;
+ spinlock_t fed_lock; /**< protects fed_mod_list */
+ long fed_dirty; /* in bytes */
+ long fed_grant; /* in bytes */
+ struct list_head fed_mod_list; /* files being modified */
+ int fed_mod_count;/* items in fed_writing list */
+ long fed_pending; /* bytes just being written */
+ __u32 fed_group;
+ __u8 fed_pagesize; /* log2 of client page size */
+};
+
+struct mgs_export_data {
+ struct list_head med_clients; /* mgc fs client via this exp */
+ spinlock_t med_lock; /* protect med_clients */
+};
+
+/**
+ * per-NID statistics structure.
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+ lnet_nid_t nid;
+ struct hlist_node nid_hash;
+ struct list_head nid_list;
+ struct obd_device *nid_obd;
+ struct proc_dir_entry *nid_proc;
+ struct lprocfs_stats *nid_stats;
+ struct lprocfs_stats *nid_ldlm_stats;
+ atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash
+ exp_nid_stats */
+};
+
+#define nidstat_getref(nidstat) \
+do { \
+ atomic_inc(&(nidstat)->nid_exp_ref_count); \
+} while (0)
+
+#define nidstat_putref(nidstat) \
+do { \
+ atomic_dec(&(nidstat)->nid_exp_ref_count); \
+ LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \
+ "stat %p nid_exp_ref_count < 0\n", nidstat); \
+} while (0)
+
+enum obd_option {
+ OBD_OPT_FORCE = 0x0001,
+ OBD_OPT_FAILOVER = 0x0002,
+ OBD_OPT_ABORT_RECOV = 0x0004,
+};
+
+/**
+ * Export structure. Represents target-side of connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network-connection in-between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+ /**
+ * Export handle: its id is provided to the client on connect.
+ * Subsequent client RPCs contain this handle id to identify
+ * which export they are talking to.
+ */
+ struct portals_handle exp_handle;
+ atomic_t exp_refcount;
+ /**
+ * Set of counters below is to track where export references are
+ * kept. The exp_rpc_count is used for reconnect handling also,
+ * the cb_count and locks_count are for debug purposes only for now.
+ * The sum of them should be less than exp_refcount by 3
+ */
+ atomic_t exp_rpc_count; /* RPC references */
+ atomic_t exp_cb_count; /* Commit callback references */
+ /** Number of queued replay requests to be processed */
+ atomic_t exp_replay_count;
+ atomic_t exp_locks_count; /** Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ struct list_head exp_locks_list;
+ spinlock_t exp_locks_list_guard;
+#endif
+ /** UUID of client connected to this export */
+ struct obd_uuid exp_client_uuid;
+ /** To link all exports on an obd device */
+ struct list_head exp_obd_chain;
+ struct hlist_node exp_uuid_hash; /** uuid-export hash*/
+ struct hlist_node exp_nid_hash; /** nid-export hash */
+ /**
+ * All exports eligible for the ping evictor are linked into a list
+ * through this field in "most time since last request on this export"
+ * order; protected by obd_dev_lock.
+ */
+ struct list_head exp_obd_chain_timed;
+ /** Obd device of this export */
+ struct obd_device *exp_obd;
+ /**
+ * "reverse" import to send requests (e.g. from ldlm) back to the client;
+ * exp_lock protects changes to it
+ */
+ struct obd_import *exp_imp_reverse;
+ struct nid_stat *exp_nid_stats;
+ struct lprocfs_stats *exp_md_stats;
+ /** Active connection */
+ struct ptlrpc_connection *exp_connection;
+ /** Connection count value from last successful reconnect rpc */
+ __u32 exp_conn_cnt;
+ /** Hash list of all ldlm locks granted on this export */
+ struct cfs_hash *exp_lock_hash;
+ /**
+ * Hash list for Posix lock deadlock detection, added with
+ * ldlm_lock::l_exp_flock_hash.
+ */
+ struct cfs_hash *exp_flock_hash;
+ struct list_head exp_outstanding_replies;
+ struct list_head exp_uncommitted_replies;
+ spinlock_t exp_uncommitted_replies_lock;
+ /** Last committed transno for this export */
+ __u64 exp_last_committed;
+ /** When was last request received */
+ unsigned long exp_last_request_time;
+ /** On replay all requests waiting for replay are linked here */
+ struct list_head exp_req_replay_queue;
+ /**
+ * protects exp_flags, exp_outstanding_replies and the change
+ * of exp_imp_reverse
+ */
+ spinlock_t exp_lock;
+ /** Compatibility flags for this export are embedded into
+ * exp_connect_data */
+ struct obd_connect_data exp_connect_data;
+ enum obd_option exp_flags;
+ unsigned long exp_failed:1,
+ exp_in_recovery:1,
+ exp_disconnected:1,
+ exp_connecting:1,
+ /** VBR: export missed recovery */
+ exp_delayed:1,
+ /** VBR: failed version checking */
+ exp_vbr_failed:1,
+ exp_req_replay_needed:1,
+ exp_lock_replay_needed:1,
+ exp_need_sync:1,
+ exp_flvr_changed:1,
+ exp_flvr_adapt:1,
+ exp_libclient:1, /* liblustre client? */
+ /* client timed out and tried to reconnect,
+ * but couldn't because of active rpcs */
+ exp_abort_active_req:1,
+ /* if to swap nidtbl entries for 2.2 clients.
+ * Only used by the MGS to fix LU-1644. */
+ exp_need_mne_swab:1;
+ /* also protected by exp_lock */
+ enum lustre_sec_part exp_sp_peer;
+ struct sptlrpc_flavor exp_flvr; /* current */
+ struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */
+ unsigned long exp_flvr_expire[2]; /* seconds */
+
+ /** protects exp_hp_rpcs */
+ spinlock_t exp_rpc_lock;
+ struct list_head exp_hp_rpcs; /* (potential) HP RPCs */
+
+ /** blocking dlm lock list, protected by exp_bl_list_lock */
+ struct list_head exp_bl_list;
+ spinlock_t exp_bl_list_lock;
+
+ /** Target specific data */
+ union {
+ struct tg_export_data eu_target_data;
+ struct mdt_export_data eu_mdt_data;
+ struct filter_export_data eu_filter_data;
+ struct ec_export_data eu_ec_data;
+ struct mgs_export_data eu_mgs_data;
+ } u;
+};
+
+#define exp_target_data u.eu_target_data
+#define exp_mdt_data u.eu_mdt_data
+#define exp_filter_data u.eu_filter_data
+#define exp_ec_data u.eu_ec_data
+
+static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp)
+{
+ return &exp->exp_connect_data.ocd_connect_flags;
+}
+
+static inline __u64 exp_connect_flags(struct obd_export *exp)
+{
+ return *exp_connect_flags_ptr(exp);
+}
+
+static inline int exp_max_brw_size(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)
+ return exp->exp_connect_data.ocd_brw_size;
+
+ return ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_connect_multibulk(struct obd_export *exp)
+{
+ return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_expired(struct obd_export *exp, long age)
+{
+ LASSERT(exp->exp_delayed);
+ return time_before(cfs_time_add(exp->exp_last_request_time, age),
+ get_seconds());
+}
+
+static inline int exp_connect_cancelset(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET);
+}
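+
+/*
+ * For illustration, these helpers gate optional protocol behaviour at the
+ * call sites; a caller (hypothetical sketch) would only pack early lock
+ * cancels into a request when the peer advertised support:
+ *
+ *	if (exp_connect_cancelset(exp))
+ *		pack_elc_cancels(req);    <- pack_elc_cancels() is a hypothetical helper
+ */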
+
+static inline int exp_connect_lru_resize(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_rmtclient(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int client_is_remote(struct obd_export *exp)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+
+ return !!(imp->imp_connect_data.ocd_connect_flags &
+ OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ LASSERT(exp->exp_connection);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR);
+}
+
+static inline int exp_connect_som(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM);
+}
+
+static inline int exp_connect_umask(struct obd_export *exp)
+{
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK);
+}
+
+static inline int imp_connect_lru_resize(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd;
+
+ LASSERT(imp != NULL);
+ ocd = &imp->imp_connect_data;
+ return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_layout(struct obd_export *exp)
+{
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK);
+}
+
+static inline bool exp_connect_lvb_type(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ return !!(exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE);
+}
+
+static inline bool imp_connect_lvb_type(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd;
+
+ LASSERT(imp != NULL);
+ ocd = &imp->imp_connect_data;
+ return !!(ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE);
+}
+
+static inline __u64 exp_connect_ibits(struct obd_export *exp)
+{
+ struct obd_connect_data *ocd;
+
+ ocd = &exp->exp_connect_data;
+ return ocd->ocd_ibits_known;
+}
+
+static inline bool imp_connect_disp_stripe(struct obd_import *imp)
+{
+ struct obd_connect_data *ocd;
+
+ LASSERT(imp != NULL);
+ ocd = &imp->imp_connect_data;
+ return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE;
+}
+
+extern struct obd_export *class_conn2export(struct lustre_handle *conn);
+extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
+
+/** @} export */
+
+#endif /* __EXPORT_H */
+/** @} obd_export */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h
new file mode 100644
index 000000000..0a0929fd9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fid.h
@@ -0,0 +1,767 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fid.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef __LUSTRE_FID_H
+#define __LUSTRE_FID_H
+
+/** \defgroup fid fid
+ *
+ * @{
+ *
+ * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs
+ * describes the FID namespace and interoperability requirements for FIDs.
+ * The important parts of that document are included here for reference.
+ *
+ * FID
+ * File IDentifier generated by client from range allocated by the SEQuence
+ * service and stored in struct lu_fid. The FID is composed of three parts:
+ * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem
+ * unique 64-bit integer, and only one client is ever assigned any SEQ value.
+ * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved
+ * for system use. The OID component is a 32-bit value generated by the
+ * client on a per-SEQ basis to allow creating many unique FIDs without
+ * communication with the server. The VER component is a 32-bit value that
+ * distinguishes between different FID instantiations, such as snapshots or
+ * separate subtrees within the filesystem. FIDs with the same VER field
+ * are considered part of the same namespace.
+ *
+ * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and
+ * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while
+ * OSTs use 64-bit Lustre object IDs and generation numbers.
+ *
+ * NEW filesystems are those formatted since the introduction of FIDs.
+ *
+ * IGIF
+ * Inode and Generation In FID, a surrogate FID used to globally identify
+ * an existing object on OLD formatted MDT file system. This would only be
+ * used on MDT0 in a DNE filesystem, because there cannot be more than one
+ * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1]
+ * range, where inode number is stored in SEQ, and inode generation is in OID.
+ * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ * which is the maximum possible for an ldiskfs backend. It also assumes
+ * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ * to clients, which has always been true.
+ *
+ * IDIF
+ * object ID In FID, a surrogate FID used to globally identify an existing
+ * OST object on OLD formatted OST file system. Belongs to a sequence in
+ * [2^32, 2^33 - 1]. Sequence number is calculated as:
+ *
+ * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object
+ * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to
+ * be identified in the FLD correctly. The OID field is calculated as:
+ *
+ * objid & 0xffffffff
+ *
+ * that is, it consists of lower 32 bits of object ID. For objects within
+ * the IDIF range, object ID extraction will be:
+ *
+ * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ * o_seq = 0; // formerly group number
+ *
+ * NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ * on any OST, and that no more than 65535 OSTs are in use. Both are very
+ * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ * a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ * or combinations thereof.
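+ *
+ * As a purely illustrative example with made-up values: for ost_index = 2
+ * and objid = 0x123456789, the formulas above give
+ *
+ *	SEQ = 1 << 32 | (2 << 16) | ((0x123456789 >> 32) & 0xffff)
+ *	    = 0x100020001
+ *	OID = 0x123456789 & 0xffffffff = 0x23456789
+ *
+ * so the resulting IDIF FID is [0x100020001:0x23456789:0].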
+ *
+ * OST_MDT0
+ * Surrogate FID used to identify an existing object on OLD formatted OST
+ * filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ * the introduction of FID-on-OST, at which point IDIF will be used to
+ * identify objects as residing on a specific OST.
+ *
+ * LLOG
+ * For Lustre Log objects the object sequence 1 is used. This is compatible
+ * with both OLD and NEW namespaces, as this SEQ number is in the
+ * ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ * sequence numbers.
+ *
+ * ECHO
+ * For testing OST IO performance the object sequence 2 is used. This is
+ * compatible with both OLD and NEW namespaces, as this SEQ number is in
+ * the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ * sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ * For testing with multiple MDTs the object sequence 3 through 9 is used,
+ * allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ * mappings. However, this SEQ range is only for testing prior to any
+ * production DNE release, as the objects in this range conflict across all
+ * OSTs, as the OST index is not part of the FID. For production DNE usage,
+ * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ * For compatibility with existing OLD OST network protocol structures, the
+ * FID must map onto the o_id and o_seq in a manner that ensures existing
+ * objects are identified consistently for IO, as well as onto the LDLM
+ * namespace to ensure IDIFs there is only a single resource name for any
+ * object in the DLM. The OLD OST object DLM resource mapping is:
+ *
+ * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ * The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ * resource[] = {SEQ, OID, VER, HASH};
+ *
+ * NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ * for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ * in all production releases the OLD o_seq field is always zero, and all
+ * valid FID OID values are non-zero, so the lock resources will not collide.
+ * Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "lustre/lustre_idl.h"
+
+struct lu_env;
+struct lu_site;
+struct lu_context;
+struct obd_device;
+struct obd_export;
+
+/* Whole sequences space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+
+enum {
+ /*
+ * This is how many metadata FIDs may be allocated in one sequence (128k)
+ */
+ LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+ /*
+ * This is how many data FIDs could be allocated in one sequence (4B - 1)
+ */
+ LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+ /*
+ * How many sequences to allocate to a client at once.
+ */
+ LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+ /*
+ * seq allocation pool size.
+ */
+ LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+ /*
+ * This is how many sequences may be in one super-sequence allocated to
+ * MDTs.
+ */
+ LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
+};
+
+enum {
+ /** 2^6 FIDs for OI containers */
+ OSD_OI_FID_OID_BITS = 6,
+ /** reserve enough FIDs in case we want more in the future */
+ OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
+/** special OID for local objects */
+enum local_oid {
+ /** \see fld_mod_init */
+ FLD_INDEX_OID = 3UL,
+ /** \see fid_mod_init */
+ FID_SEQ_CTL_OID = 4UL,
+ FID_SEQ_SRV_OID = 5UL,
+ /** \see mdd_mod_init */
+ MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */
+ MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */
+ MDD_LOV_OBJ_OID = 8UL,
+ MDD_CAPA_KEYS_OID = 9UL,
+ /** \see mdt_mod_init */
+ LAST_RECV_OID = 11UL,
+ OSD_FS_ROOT_OID = 13UL,
+ ACCT_USER_OID = 15UL,
+ ACCT_GROUP_OID = 16UL,
+ LFSCK_BOOKMARK_OID = 17UL,
+ OTABLE_IT_OID = 18UL,
+ /* These two definitions are obsolete
+ * OFD_GROUP0_LAST_OID = 20UL,
+ * OFD_GROUP4K_LAST_OID = 20UL+4096,
+ */
+ OFD_LAST_GROUP_OID = 4117UL,
+ LLOG_CATALOGS_OID = 4118UL,
+ MGS_CONFIGS_OID = 4119UL,
+ OFD_HEALTH_CHECK_OID = 4120UL,
+ MDD_LOV_OBJ_OSEQ = 4121UL,
+ LFSCK_NAMESPACE_OID = 4122UL,
+ REMOTE_PARENT_DIR_OID = 4123UL,
+};
+
+static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+ fid->f_seq = FID_SEQ_LOCAL_FILE;
+ fid->f_oid = oid;
+ fid->f_ver = 0;
+}
+
+static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+ fid->f_seq = FID_SEQ_LOCAL_NAME;
+ fid->f_oid = oid;
+ fid->f_ver = 0;
+}
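+
+/*
+ * For illustration, building the FID of a local object (sketch):
+ *
+ *	struct lu_fid fid;
+ *
+ *	lu_local_obj_fid(&fid, ACCT_USER_OID);
+ *
+ * after which fid is [FID_SEQ_LOCAL_FILE:ACCT_USER_OID:0].
+ */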
+
+/* For a new FS (>= 2.4), the root FID will be [FID_SEQ_ROOT:1:0];
+ * for an existing FS upgraded to 2.4, the root FID will still be
+ * an IGIF. */
+static inline int fid_is_root(const struct lu_fid *fid)
+{
+ return unlikely((fid_seq(fid) == FID_SEQ_ROOT &&
+ fid_oid(fid) == 1));
+}
+
+static inline int fid_is_dot_lustre(const struct lu_fid *fid)
+{
+ return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+ fid_oid(fid) == FID_OID_DOT_LUSTRE);
+}
+
+static inline int fid_is_obf(const struct lu_fid *fid)
+{
+ return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+ fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF);
+}
+
+static inline int fid_is_otable_it(const struct lu_fid *fid)
+{
+ return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+ fid_oid(fid) == OTABLE_IT_OID);
+}
+
+static inline int fid_is_acct(const struct lu_fid *fid)
+{
+ return fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+ (fid_oid(fid) == ACCT_USER_OID ||
+ fid_oid(fid) == ACCT_GROUP_OID);
+}
+
+static inline int fid_is_quota(const struct lu_fid *fid)
+{
+ return fid_seq(fid) == FID_SEQ_QUOTA ||
+ fid_seq(fid) == FID_SEQ_QUOTA_GLB;
+}
+
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
+{
+ const __u64 seq = fid_seq(fid);
+
+ /* Here, we cannot distinguish whether a normal FID is for an OST
+ * object or not. It is the caller's duty to check further if needed. */
+ return (!fid_is_last_id(fid) &&
+ (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) ||
+ fid_is_root(fid) || fid_is_dot_lustre(fid);
+}
+
+static inline int fid_seq_in_fldb(__u64 seq)
+{
+ return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) ||
+ fid_seq_is_root(seq) || fid_seq_is_dot(seq);
+}
+
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq)
+{
+ if (fid_seq_is_mdt0(seq)) {
+ fid->f_seq = fid_idif_seq(0, 0);
+ } else {
+ LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+ fid_seq_is_idif(seq), "%#llx\n", seq);
+ fid->f_seq = seq;
+ }
+ fid->f_oid = 0;
+ fid->f_ver = 0;
+}
+
+/* seq client type */
+enum lu_cli_type {
+ LUSTRE_SEQ_METADATA = 1,
+ LUSTRE_SEQ_DATA
+};
+
+enum lu_mgr_type {
+ LUSTRE_SEQ_SERVER,
+ LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+ /* Sequence-controller export. */
+ struct obd_export *lcs_exp;
+ struct mutex lcs_mutex;
+
+ /*
+ * Range of sequences allowed for allocation. When lu_client_seq is used
+ * on clients, this contains the meta-sequence range; on servers it
+ * contains the super-sequence range.
+ */
+ struct lu_seq_range lcs_space;
+
+ /* Seq related proc */
+ struct proc_dir_entry *lcs_proc_dir;
+
+ /* This holds last allocated fid in last obtained seq */
+ struct lu_fid lcs_fid;
+
+ /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+ enum lu_cli_type lcs_type;
+
+ /*
+ * Service UUID, passed from the MDT and combined with the seq name to
+ * form a unique seq name for use with procfs.
+ */
+ char lcs_name[LUSTRE_MDT_MAXNAMELEN];
+
+ /*
+ * Sequence width, that is, how many objects may be allocated in one
+ * sequence. The default value is LUSTRE_SEQ_MAX_WIDTH.
+ */
+ __u64 lcs_width;
+
+ /* Seq-server for direct talking */
+ struct lu_server_seq *lcs_srv;
+
+ /* wait queue for fid allocation and update indicator */
+ wait_queue_head_t lcs_waitq;
+ int lcs_update;
+};
+
+/* server sequence manager interface */
+struct lu_server_seq {
+ /* Available sequences space */
+ struct lu_seq_range lss_space;
+
+ /* keeps highwater in lsr_end for seq allocation algorithm */
+ struct lu_seq_range lss_lowater_set;
+ struct lu_seq_range lss_hiwater_set;
+
+ /*
+ * Device used by the server-side seq manager (for saving sequences to
+ * the backing store).
+ */
+ struct dt_device *lss_dev;
+
+ /* /seq file object device */
+ struct dt_object *lss_obj;
+
+ /* Seq related proc */
+ struct proc_dir_entry *lss_proc_dir;
+
+ /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */
+ enum lu_mgr_type lss_type;
+
+ /* Client interface to request controller */
+ struct lu_client_seq *lss_cli;
+
+ /* Mutex for protecting allocation */
+ struct mutex lss_mutex;
+
+ /*
+ * Service UUID, passed from the MDT and combined with the seq name to
+ * form a unique seq name for use with procfs.
+ */
+ char lss_name[LUSTRE_MDT_MAXNAMELEN];
+
+ /*
+ * Allocation chunks for super and meta sequences. Default values are
+ * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH.
+ */
+ __u64 lss_width;
+
+ /*
+ * minimum lss_alloc_set size that should be allocated from
+ * lss_space
+ */
+ __u64 lss_set_width;
+
+ /* sync is needed for update operation */
+ __u32 lss_need_sync;
+
+ /**
+ * Pointer to site object, required to access site fld.
+ */
+ struct seq_server_site *lss_site;
+};
+
+/* Server methods */
+
+int seq_server_init(struct lu_server_seq *seq,
+ struct dt_device *dev,
+ const char *prefix,
+ enum lu_mgr_type type,
+ struct seq_server_site *ss,
+ const struct lu_env *env);
+
+void seq_server_fini(struct lu_server_seq *seq,
+ const struct lu_env *env);
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+ struct lu_seq_range *out,
+ const struct lu_env *env);
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+ struct lu_seq_range *out,
+ const struct lu_env *env);
+
+int seq_server_set_cli(struct lu_server_seq *seq,
+ struct lu_client_seq *cli,
+ const struct lu_env *env);
+
+/* Client methods */
+int seq_client_init(struct lu_client_seq *seq,
+ struct obd_export *exp,
+ enum lu_cli_type type,
+ const char *prefix,
+ struct lu_server_seq *srv);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+void seq_client_flush(struct lu_client_seq *seq);
+
+int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq,
+ struct lu_fid *fid);
+int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq,
+ u64 *seqnr);
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss);
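+
+/*
+ * For illustration, a typical client-side FID allocation (sketch, error
+ * handling trimmed; "env" and "seq" are assumed to be set up already):
+ *
+ *	struct lu_fid fid;
+ *	int rc;
+ *
+ *	rc = seq_client_alloc_fid(env, seq, &fid);
+ *	if (rc < 0)
+ *		return rc;
+ *
+ * after which fid holds a new [SEQ:OID:0] from the client's allocated range.
+ */
+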
+/* Fids common stuff */
+int fid_is_local(const struct lu_env *env,
+ struct lu_site *site, const struct lu_fid *fid);
+
+enum lu_cli_type;
+int client_fid_init(struct obd_device *obd, struct obd_export *exp,
+ enum lu_cli_type type);
+int client_fid_fini(struct obd_device *obd);
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ */
+static inline struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res)
+{
+ memset(res, 0, sizeof(*res));
+ res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid);
+ res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid);
+
+ return res;
+}
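+
+/*
+ * For illustration (made-up FID value): [0x200000400:0x11:0] maps to
+ * res->name[] = { 0x200000400, 0x11, 0, 0 }, since fid_ver_oid() packs
+ * VER into the high 32 bits and OID into the low 32 bits of name[1].
+ */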
+
+/*
+ * Return true if resource is for object identified by FID.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *fid,
+ const struct ldlm_res_id *res)
+{
+ return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) &&
+ res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid);
+}
+
+/*
+ * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name().
+ */
+static inline struct lu_fid *
+fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res)
+{
+ fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+ fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]);
+ fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+ LASSERT(fid_res_name_eq(fid, res));
+
+ return fid;
+}
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID.
+ */
+static inline struct ldlm_res_id *
+fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid,
+ struct ldlm_res_id *res)
+{
+ fid_build_reg_res_name(glb_fid, res);
+ res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid);
+ res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid);
+
+ return res;
+}
+
+/*
+ * Extract global FID and quota ID from resource name
+ */
+static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid,
+ union lquota_id *qid,
+ const struct ldlm_res_id *res)
+{
+ fid_extract_from_res_name(glb_fid, res);
+ qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+ qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+ qid->qid_fid.f_ver =
+ (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32);
+}
+
+static inline struct ldlm_res_id *
+fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash,
+ struct ldlm_res_id *res)
+{
+ fid_build_reg_res_name(fid, res);
+ res->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+
+ return res;
+}
+
+/**
+ * Build a DLM resource name from the object id & seq. This will be
+ * removed eventually, when ost_id is replaced with the FID in the data
+ * stack.
+ *
+ * Currently, a resid from an old client, where res[0] = object_id and
+ * res[1] = object_seq, is just the opposite of the metadata
+ * resid, where res[0] = fid->f_seq and res[1] = fid->f_oid.
+ * To unify the resid identification, we reverse the data
+ * resid to keep it the same as the metadata resid, i.e.
+ *
+ * for a resid from an old client,
+ * res[0] = objid, res[1] = 0, keeping the original order
+ * for compatibility;
+ *
+ * for a new resid,
+ * res is built from the normal FID directly, i.e. res[0] = f_seq,
+ * res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(struct ost_id *oi,
+ struct ldlm_res_id *name)
+{
+ memset(name, 0, sizeof(*name));
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+ } else {
+ fid_build_reg_res_name(&oi->oi_fid, name);
+ }
+}
+
+static inline void ostid_res_name_to_id(struct ost_id *oi,
+ struct ldlm_res_id *name)
+{
+ if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+ /* old resid */
+ ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+ ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+ } else {
+ /* new resid */
+ fid_extract_from_res_name(&oi->oi_fid, name);
+ }
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ */
+static inline int ostid_res_name_eq(struct ost_id *oi,
+ struct ldlm_res_id *name)
+{
+ /* Note: this is just a shortcut to save some effort; the correct
+ * way would probably be to turn them into FIDs and compare. */
+ if (fid_seq_is_mdt0(ostid_seq(oi))) {
+ return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+ } else {
+ return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+ name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+ }
+}
+
+/* The same as osc_build_res_name() */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+ struct ldlm_res_id *resname)
+{
+ if (fid_is_mdt0(fid) || fid_is_idif(fid)) {
+ struct ost_id oi;
+ oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */
+ if (fid_to_ostid(fid, &oi) != 0)
+ return;
+ ostid_build_res_name(&oi, resname);
+ } else {
+ fid_build_reg_res_name(fid, resname);
+ }
+}
+
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+ const struct ldlm_res_id *name)
+{
+ if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+ /* old resid */
+ struct ost_id oi;
+ ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+ ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+ ostid_to_fid(fid, &oi, 0);
+ } else {
+ /* new resid */
+ fid_extract_from_res_name(fid, name);
+ }
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently
+ * true, the time between re-used inode numbers is very long - 2^40 SEQ
+ * numbers, or about 2^40 client mounts, if clients create fewer than 2^24
+ * files per mount.
+ */
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+ __u64 ino;
+ __u64 seq;
+
+ if (fid_is_igif(fid)) {
+ ino = lu_igif_ino(fid);
+ return ino;
+ }
+
+ seq = fid_seq(fid);
+
+ ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
+
+ return ino ? ino : fid_oid(fid);
+}
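+
+/*
+ * For illustration (made-up FID value): flattening [0x200000400:0x11:0]
+ * gives
+ *
+ *	ino = (0x200000400 << 24) + ((0x200000400 >> 24) & 0xffffff0000ULL) + 0x11
+ *	    = 0x200000400000011
+ *
+ * i.e. the SEQ supplies the high bits and the OID the low bits of the
+ * inode number.
+ */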
+
+static inline __u32 fid_hash(const struct lu_fid *f, int bits)
+{
+ /* all objects with the same id and different versions will belong to
+ * the same collision list. */
+ return hash_long(fid_flatten(f), bits);
+}
+
+/**
+ * Map a FID to a 32-bit value for use as an inode number on 32-bit systems. */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+ __u32 ino;
+ __u64 seq;
+
+ if (fid_is_igif(fid)) {
+ ino = lu_igif_ino(fid);
+ return ino;
+ }
+
+ seq = fid_seq(fid) - FID_SEQ_START;
+
+ /* Map the high bits of the OID into higher bits of the inode number so
+ * that inodes generated at about the same time have a reduced chance
+ * of collisions. This will give a period of 2^12 = 4096 unique clients
+ * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+ * (from OID), or up to 128M inodes without collisions for new files. */
+ ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+ (seq >> (64 - (40-8)) & 0xffffff00) +
+ (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+
+ return ino ? ino : fid_oid(fid);
+}
+
+static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2)
+{
+ LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+ PFID(fid1), PFID(fid2));
+
+ if (fid_is_idif(fid1) && fid_is_idif(fid2))
+ return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) -
+ fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver);
+
+ return fid_oid(fid1) - fid_oid(fid2);
+}
+
+#define LUSTRE_SEQ_SRV_NAME "seq_srv"
+#define LUSTRE_SEQ_CTL_NAME "seq_ctl"
+
+/* Range common stuff */
+static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = cpu_to_le64(src->lsr_start);
+ dst->lsr_end = cpu_to_le64(src->lsr_end);
+ dst->lsr_index = cpu_to_le32(src->lsr_index);
+ dst->lsr_flags = cpu_to_le32(src->lsr_flags);
+}
+
+static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = le64_to_cpu(src->lsr_start);
+ dst->lsr_end = le64_to_cpu(src->lsr_end);
+ dst->lsr_index = le32_to_cpu(src->lsr_index);
+ dst->lsr_flags = le32_to_cpu(src->lsr_flags);
+}
+
+static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = cpu_to_be64(src->lsr_start);
+ dst->lsr_end = cpu_to_be64(src->lsr_end);
+ dst->lsr_index = cpu_to_be32(src->lsr_index);
+ dst->lsr_flags = cpu_to_be32(src->lsr_flags);
+}
+
+static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+ dst->lsr_start = be64_to_cpu(src->lsr_start);
+ dst->lsr_end = be64_to_cpu(src->lsr_end);
+ dst->lsr_index = be32_to_cpu(src->lsr_index);
+ dst->lsr_flags = be32_to_cpu(src->lsr_flags);
+}
+
+/** @} fid */
+
+#endif /* __LUSTRE_FID_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h
new file mode 100644
index 000000000..5ee4b1ed0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_fld.h
@@ -0,0 +1,160 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_FLD_H
+#define __LINUX_FLD_H
+
+/** \defgroup fld fld
+ *
+ * @{
+ */
+
+#include "lustre/lustre_idl.h"
+#include "../../include/linux/libcfs/libcfs.h"
+
+struct lu_client_fld;
+struct lu_server_fld;
+struct lu_fld_hash;
+struct fld_cache;
+
+extern const struct dt_index_features fld_index_features;
+extern const char fld_index_name[];
+
+/*
+ * FLD (Fid Location Database) interface.
+ */
+enum {
+ LUSTRE_CLI_FLD_HASH_DHT = 0,
+ LUSTRE_CLI_FLD_HASH_RRB
+};
+
+
+struct lu_fld_target {
+ struct list_head ft_chain;
+ struct obd_export *ft_exp;
+ struct lu_server_fld *ft_srv;
+ __u64 ft_idx;
+};
+
+struct lu_server_fld {
+ /**
+ * Fld dir proc entry. */
+ struct proc_dir_entry *lsf_proc_dir;
+
+ /**
+ * /fld file object device */
+ struct dt_object *lsf_obj;
+
+ /**
+ * super sequence controller export, needed to forward fld
+ * lookup request. */
+ struct obd_export *lsf_control_exp;
+
+ /**
+ * Client FLD cache. */
+ struct fld_cache *lsf_cache;
+
+ /**
+ * Protect index modifications */
+ struct mutex lsf_lock;
+
+ /**
+ * Fld service name in form "fld-srv-lustre-MDTXXX" */
+ char lsf_name[LUSTRE_MDT_MAXNAMELEN];
+
+};
+
+struct lu_client_fld {
+ /**
+ * Client side proc entry. */
+ struct proc_dir_entry *lcf_proc_dir;
+
+ /**
+ * List of exports client FLD knows about. */
+ struct list_head lcf_targets;
+
+ /**
+ * Current hash to be used to choose an export.
+ struct lu_fld_hash *lcf_hash;
+
+ /**
+ * Exports count. */
+ int lcf_count;
+
+ /**
+ * Lock protecting exports list and fld_hash. */
+ spinlock_t lcf_lock;
+
+ /**
+ * Client FLD cache. */
+ struct fld_cache *lcf_cache;
+
+ /**
+ * Client fld proc entry name. */
+ char lcf_name[LUSTRE_MDT_MAXNAMELEN];
+
+ int lcf_flags;
+};
+
+/* Client methods */
+int fld_client_init(struct lu_client_fld *fld,
+ const char *prefix, int hash);
+
+void fld_client_fini(struct lu_client_fld *fld);
+
+void fld_client_flush(struct lu_client_fld *fld);
+
+int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds,
+ __u32 flags, const struct lu_env *env);
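+
+/*
+ * For illustration, a typical lookup (sketch, error handling trimmed)
+ * resolving which MDT index serves a given sequence; LU_SEQ_RANGE_MDT is
+ * the usual flag for metadata lookups:
+ *
+ *	u32 mds;
+ *	int rc;
+ *
+ *	rc = fld_client_lookup(fld, fid_seq(fid), &mds, LU_SEQ_RANGE_MDT, env);
+ */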
+
+int fld_client_create(struct lu_client_fld *fld,
+ struct lu_seq_range *range,
+ const struct lu_env *env);
+
+int fld_client_delete(struct lu_client_fld *fld, u64 seq,
+ const struct lu_env *env);
+
+int fld_client_add_target(struct lu_client_fld *fld,
+ struct lu_fld_target *tar);
+
+int fld_client_del_target(struct lu_client_fld *fld,
+ __u64 idx);
+
+void fld_client_proc_fini(struct lu_client_fld *fld);
+
+/** @} fld */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h
new file mode 100644
index 000000000..f3ae02b3e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ha.h
@@ -0,0 +1,64 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_HA_H
+#define _LUSTRE_HA_H
+
+/** \defgroup ha ha
+ *
+ * @{
+ */
+
+struct obd_import;
+struct obd_export;
+struct obd_device;
+struct ptlrpc_request;
+
+
+int ptlrpc_replay(struct obd_import *imp);
+int ptlrpc_resend(struct obd_import *imp);
+void ptlrpc_free_committed(struct obd_import *imp);
+void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_activate_import(struct obd_import *imp);
+void ptlrpc_deactivate_import(struct obd_import *imp);
+void ptlrpc_invalidate_import(struct obd_import *imp);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
+
+/** @} ha */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h
new file mode 100644
index 000000000..726bbd3ea
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_handles.h
@@ -0,0 +1,97 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_HANDLES_H_
+#define __LUSTRE_HANDLES_H_
+
+/** \defgroup handles handles
+ *
+ * @{
+ */
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+
+struct portals_handle_ops {
+ void (*hop_addref)(void *object);
+ void (*hop_free)(void *object, int size);
+};
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object you want to make handles for, e.g.:
+ *
+ * struct ldlm_lock {
+ * struct portals_handle handle;
+ * ...
+ * };
+ *
+ * Now you're able to assign the results of cookie2handle directly to an
+ * ldlm_lock. If it's not at the top, you'll want to use container_of()
+ * to compute the start of the structure based on the handle field. */
+struct portals_handle {
+ struct list_head h_link;
+ __u64 h_cookie;
+ struct portals_handle_ops *h_ops;
+
+ /* newly added fields to handle the RCU issue. -jxiong */
+ struct rcu_head h_rcu;
+ spinlock_t h_lock;
+ unsigned int h_size:31;
+ unsigned int h_in:1;
+};
+#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu)
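+
+/*
+ * Minimal usage sketch (the "my_obj" structure, its ops table and callbacks
+ * are hypothetical and not defined in this header): embed the handle as the
+ * first member, hash it, then map the cookie back to the object later.
+ *
+ *	struct my_obj {
+ *		struct portals_handle mo_handle;
+ *		int mo_payload;
+ *	};
+ *
+ *	static struct portals_handle_ops my_obj_ops = {
+ *		.hop_addref = my_obj_addref,
+ *		.hop_free   = my_obj_free,
+ *	};
+ *
+ *	class_handle_hash(&obj->mo_handle, &my_obj_ops);
+ *	cookie = obj->mo_handle.h_cookie;
+ *	...
+ *	obj = class_handle2object(cookie);
+ *	class_handle_unhash(&obj->mo_handle);
+ */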
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *,
+ struct portals_handle_ops *ops);
+void class_handle_unhash(struct portals_handle *);
+void class_handle_hash_back(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+void class_handle_free_cb(struct rcu_head *rcu);
+int class_handle_init(void);
+void class_handle_cleanup(void);
+
+/** @} handles */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
new file mode 100644
index 000000000..dcc807676
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_import.h
@@ -0,0 +1,385 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_import PtlRPC import definitions
+ * Imports are client-side representation of remote obd target.
+ *
+ * @{
+ */
+
+#ifndef __IMPORT_H
+#define __IMPORT_H
+
+/** \defgroup import import
+ *
+ * @{
+ */
+
+#include "lustre_handles.h"
+#include "lustre/lustre_idl.h"
+
+
+/**
+ * Adaptive Timeout stuff
+ *
+ * @{
+ */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4 /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1 /* use last reported value only */
+
+struct adaptive_timeout {
+ time_t at_binstart; /* bin start time */
+ unsigned int at_hist[AT_BINS]; /* timeout history bins */
+ unsigned int at_flags;
+ unsigned int at_current; /* current timeout value */
+ unsigned int at_worst_ever; /* worst-ever timeout value */
+ time_t at_worst_time; /* worst-ever timeout timestamp */
+ spinlock_t at_lock;
+};
+
+struct ptlrpc_at_array {
+ struct list_head *paa_reqs_array; /** array to hold requests */
+ __u32 paa_size; /** the size of array */
+ __u32 paa_count; /** the total count of reqs */
+ time_t paa_deadline; /** the earliest deadline of reqs */
+ __u32 *paa_reqs_count; /** the count of reqs in each entry */
+};
+
+#define IMP_AT_MAX_PORTALS 8
+struct imp_at {
+ int iat_portal[IMP_AT_MAX_PORTALS];
+ struct adaptive_timeout iat_net_latency;
+ struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
+
+/** @} */
+
+/** Possible import states */
+enum lustre_imp_state {
+ LUSTRE_IMP_CLOSED = 1,
+ LUSTRE_IMP_NEW = 2,
+ LUSTRE_IMP_DISCON = 3,
+ LUSTRE_IMP_CONNECTING = 4,
+ LUSTRE_IMP_REPLAY = 5,
+ LUSTRE_IMP_REPLAY_LOCKS = 6,
+ LUSTRE_IMP_REPLAY_WAIT = 7,
+ LUSTRE_IMP_RECOVER = 8,
+ LUSTRE_IMP_FULL = 9,
+ LUSTRE_IMP_EVICTED = 10,
+};
+
+/** Returns the text string representation of numeric import state \a state */
+static inline char *ptlrpc_import_state_name(enum lustre_imp_state state)
+{
+ static char *import_state_names[] = {
+ "<UNKNOWN>", "CLOSED", "NEW", "DISCONN",
+ "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT",
+ "RECOVER", "FULL", "EVICTED",
+ };
+
+ LASSERT(state <= LUSTRE_IMP_EVICTED);
+ return import_state_names[state];
+}
+
+/**
+ * List of import event types
+ */
+enum obd_import_event {
+ IMP_EVENT_DISCON = 0x808001,
+ IMP_EVENT_INACTIVE = 0x808002,
+ IMP_EVENT_INVALIDATE = 0x808003,
+ IMP_EVENT_ACTIVE = 0x808004,
+ IMP_EVENT_OCD = 0x808005,
+ IMP_EVENT_DEACTIVATE = 0x808006,
+ IMP_EVENT_ACTIVATE = 0x808007,
+};
+
+/**
+ * Definition of import connection structure
+ */
+struct obd_import_conn {
+ /** Item for linking connections together */
+ struct list_head oic_item;
+ /** Pointer to actual PortalRPC connection */
+ struct ptlrpc_connection *oic_conn;
+ /** uuid of remote side */
+ struct obd_uuid oic_uuid;
+ /**
+ * Time (64 bit jiffies) of last connection attempt on this connection
+ */
+ __u64 oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+ enum lustre_imp_state ish_state;
+ time_t ish_time;
+};
+
+/**
+ * Definition of PortalRPC import structure.
+ * Imports represent the client-side view of a remote target.
+ */
+struct obd_import {
+ /** Local handle (== id) for this import. */
+ struct portals_handle imp_handle;
+ /** Reference counter */
+ atomic_t imp_refcount;
+ struct lustre_handle imp_dlm_handle; /* client's ldlm export */
+ /** Currently active connection */
+ struct ptlrpc_connection *imp_connection;
+ /** PortalRPC client structure for this import */
+ struct ptlrpc_client *imp_client;
+ /** List element for linking into pinger chain */
+ struct list_head imp_pinger_chain;
+ /** List element for linking into chain for destruction */
+ struct list_head imp_zombie_chain;
+
+ /**
+ * Lists of requests that are retained for replay, waiting for a reply,
+ * or waiting for recovery to complete, respectively.
+ * @{
+ */
+ struct list_head imp_replay_list;
+ struct list_head imp_sending_list;
+ struct list_head imp_delayed_list;
+ /** @} */
+
+ /**
+ * List of requests that are retained for committed open replay. Once
+ * open is committed, open replay request will be moved from the
+ * imp_replay_list into the imp_committed_list.
+ * The imp_replay_cursor is for accelerating searching during replay.
+ * @{
+ */
+ struct list_head imp_committed_list;
+ struct list_head *imp_replay_cursor;
+ /** @} */
+
+ /** obd device for this import */
+ struct obd_device *imp_obd;
+
+ /**
+ * some security-related fields
+ * @{
+ */
+ struct ptlrpc_sec *imp_sec;
+ struct mutex imp_sec_mutex;
+ unsigned long imp_sec_expire;
+ /** @} */
+
+ /** Wait queue for those who need to wait for recovery completion */
+ wait_queue_head_t imp_recovery_waitq;
+
+ /** Number of requests currently in-flight */
+ atomic_t imp_inflight;
+ /** Number of requests currently unregistering */
+ atomic_t imp_unregistering;
+ /** Number of replay requests inflight */
+ atomic_t imp_replay_inflight;
+ /** Number of currently happening import invalidations */
+ atomic_t imp_inval_count;
+ /** Number of request timeouts */
+ atomic_t imp_timeouts;
+ /** Current import state */
+ enum lustre_imp_state imp_state;
+ /** Last replay state */
+ enum lustre_imp_state imp_replay_state;
+ /** History of import states */
+ struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
+ int imp_state_hist_idx;
+ /** Current import generation. Incremented on every reconnect */
+ int imp_generation;
+ /** Incremented every time we send reconnection request */
+ __u32 imp_conn_cnt;
+ /**
+ * \see ptlrpc_free_committed remembers imp_generation value here
+ * after a check to save on unnecessary replay list iterations
+ */
+ int imp_last_generation_checked;
+ /** Last transno we replayed */
+ __u64 imp_last_replay_transno;
+ /** Last transno committed on remote side */
+ __u64 imp_peer_committed_transno;
+ /**
+ * \see ptlrpc_free_committed remembers last_transno since its last
+ * check here and if last_transno did not change since last run of
+ * ptlrpc_free_committed and import generation is the same, we can
+ * skip looking for requests to remove from replay list as optimisation
+ */
+ __u64 imp_last_transno_checked;
+ /**
+ * Remote export handle. This is how remote side knows what export
+ * we are talking to. Filled from response to connect request
+ */
+ struct lustre_handle imp_remote_handle;
+ /** When to perform next ping. time in jiffies. */
+ unsigned long imp_next_ping;
+ /** When we last successfully connected. time in 64bit jiffies */
+ __u64 imp_last_success_conn;
+
+ /** List of all possible connection for import. */
+ struct list_head imp_conn_list;
+ /**
+ * Current connection. \a imp_connection is imp_conn_current->oic_conn
+ */
+ struct obd_import_conn *imp_conn_current;
+
+ /** Protects flags, level, generation, conn_cnt, *_list */
+ spinlock_t imp_lock;
+
+ /* flags */
+ unsigned long imp_no_timeout:1, /* timeouts are disabled */
+ imp_invalid:1, /* evicted */
+ /* administratively disabled */
+ imp_deactive:1,
+ /* try to recover the import */
+ imp_replayable:1,
+ /* don't run recovery (timeout instead) */
+ imp_dlm_fake:1,
+ /* use 1/2 timeout on MDS' OSCs */
+ imp_server_timeout:1,
+ /* VBR: imp in delayed recovery */
+ imp_delayed_recovery:1,
+ /* VBR: if a gap was found then no lock replays */
+ imp_no_lock_replay:1,
+ /* recovery by versions has failed */
+ imp_vbr_failed:1,
+ /* force an immediate ping */
+ imp_force_verify:1,
+ /* force a scheduled ping */
+ imp_force_next_verify:1,
+ /* pingable */
+ imp_pingable:1,
+ /* resend for replay */
+ imp_resend_replay:1,
+ /* disable normal recovery, for test only. */
+ imp_no_pinger_recover:1,
+ /* need IR MNE swab */
+ imp_need_mne_swab:1,
+ /* import must be reconnected instead of
+ * choosing a new connection */
+ imp_force_reconnect:1,
+ /* import has tried to connect with server */
+ imp_connect_tried:1;
+ __u32 imp_connect_op;
+ struct obd_connect_data imp_connect_data;
+ __u64 imp_connect_flags_orig;
+ int imp_connect_error;
+
+ __u32 imp_msg_magic;
+ __u32 imp_msghdr_flags; /* adjusted based on server capability */
+
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
+
+ struct imp_at imp_at; /* adaptive timeout data */
+ time_t imp_last_reply_time; /* for health check */
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+ int event, void *event_arg, void *cb_data);
+
+/**
+ * Structure for import observer.
+ * It is possible to register an "observer" on an import; every time
+ * something happens to the import (such as connect/evict/disconnect),
+ * the observer's callback is called with the event type.
+ */
+struct obd_import_observer {
+ struct list_head oio_chain;
+ obd_import_callback oio_cb;
+ void *oio_cb_data;
+};
+
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+ void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+ void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+ void *event_arg);
+
+/* import.c */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+ /* add an arbitrary minimum: 125% +5 sec */
+ return (val + (val >> 2) + 5);
+}
+
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+ /* restore estimate value from timeout: e=4/5(t-5) */
+ LASSERT(val);
+ return (max((val << 2) / 5, 5U) - 4);
+}
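+
+/*
+ * Worked example (illustrative only): an estimate of 20s yields
+ * at_est2timeout(20) = 20 + 20/4 + 5 = 30, and the inverse recovers it:
+ * at_timeout2est(30) = max(30 * 4 / 5, 5) - 4 = 24 - 4 = 20.
+ */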
+
+static inline void at_reset(struct adaptive_timeout *at, int val)
+{
+ spin_lock(&at->at_lock);
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = get_seconds();
+ spin_unlock(&at->at_lock);
+}
+static inline void at_init(struct adaptive_timeout *at, int val, int flags)
+{
+ memset(at, 0, sizeof(*at));
+ spin_lock_init(&at->at_lock);
+ at->at_flags = flags;
+ at_reset(at, val);
+}
+extern unsigned int at_min;
+static inline int at_get(struct adaptive_timeout *at)
+{
+ return (at->at_current > at_min) ? at->at_current : at_min;
+}
+int at_measured(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
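+
+/*
+ * Typical use (sketch only; the caller and the observed value fed to
+ * at_measured() are illustrative, not prescribed by this header):
+ *
+ *	struct adaptive_timeout at;
+ *
+ *	at_init(&at, obd_timeout, 0);
+ *	at_measured(&at, service_time);		feed an observed value
+ *	timeout = at_est2timeout(at_get(&at));	clamped from below by at_min
+ */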
+
+/* genops.c */
+struct obd_export;
+extern struct obd_import *class_exp2cliimp(struct obd_export *);
+extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
+
+/** @} import */
+
+#endif /* __IMPORT_H */
+
+/** @} obd_import */
diff --git a/drivers/staging/lustre/lustre/include/lustre_intent.h b/drivers/staging/lustre/lustre/include/lustre_intent.h
new file mode 100644
index 000000000..c491d52d8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_intent.h
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_INTENT_H
+#define LUSTRE_INTENT_H
+
+/* intent IT_XXX are defined in lustre/include/obd.h */
+struct lustre_intent_data {
+ int it_disposition;
+ int it_status;
+ __u64 it_lock_handle;
+ __u64 it_lock_bits;
+ int it_lock_mode;
+ int it_remote_lock_mode;
+ __u64 it_remote_lock_handle;
+ void *it_data;
+ unsigned int it_lock_set:1;
+};
+
+struct lookup_intent {
+ int it_op;
+ int it_create_mode;
+ __u64 it_flags;
+ union {
+ struct lustre_intent_data lustre;
+ } d;
+};
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
new file mode 100644
index 000000000..bf135630c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h
@@ -0,0 +1,666 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LUSTRE_LIB_H
+#define _LUSTRE_LIB_H
+
+/** \defgroup lib lib
+ *
+ * @{
+ */
+
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/types.h>
+#include "../../include/linux/libcfs/libcfs.h"
+#include "lustre/lustre_idl.h"
+#include "lustre_ver.h"
+#include "lustre_cfg.h"
+
+/* target.c */
+struct kstatfs;
+struct ptlrpc_request;
+struct obd_export;
+struct lu_target;
+struct l_wait_info;
+#include "lustre_ha.h"
+#include "lustre_net.h"
+
+#define LI_POISON 0x5a5a5a5a
+#if BITS_PER_LONG > 32
+# define LL_POISON 0x5a5a5a5a5a5a5a5aL
+#else
+# define LL_POISON 0x5a5a5a5aL
+#endif
+#define LP_POISON ((void *)LL_POISON)
+
+int target_pack_pool_reply(struct ptlrpc_request *req);
+int do_set_info_async(struct obd_import *imp,
+ int opcode, int version,
+ u32 keylen, void *key,
+ u32 vallen, void *val,
+ struct ptlrpc_request_set *set);
+
+#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
+#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
+
+/* client.c */
+
+int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+struct client_obd *client_conn2cli(struct lustre_handle *conn);
+
+struct md_open_data;
+struct obd_client_handle {
+ struct lustre_handle och_fh;
+ struct lu_fid och_fid;
+ struct md_open_data *och_mod;
+ struct lustre_handle och_lease_handle; /* open lock for lease */
+ __u32 och_magic;
+ fmode_t och_flags;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
+
+/* statfs_pack.c */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs);
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs);
+
+/*
+ * For md echo client
+ */
+enum md_echo_cmd {
+ ECHO_MD_CREATE = 1, /* Open/Create file on MDT */
+ ECHO_MD_MKDIR = 2, /* Mkdir on MDT */
+ ECHO_MD_DESTROY = 3, /* Unlink file on MDT */
+ ECHO_MD_RMDIR = 4, /* Rmdir on MDT */
+ ECHO_MD_LOOKUP = 5, /* Lookup on MDT */
+ ECHO_MD_GETATTR = 6, /* Getattr on MDT */
+ ECHO_MD_SETATTR = 7, /* Setattr on MDT */
+ ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */
+};
+
+/*
+ * OBD IOCTLS
+ */
+#define OBD_IOCTL_VERSION 0x00010004
+
+struct obd_ioctl_data {
+ __u32 ioc_len;
+ __u32 ioc_version;
+
+ union {
+ __u64 ioc_cookie;
+ __u64 ioc_u64_1;
+ };
+ union {
+ __u32 ioc_conn1;
+ __u32 ioc_u32_1;
+ };
+ union {
+ __u32 ioc_conn2;
+ __u32 ioc_u32_2;
+ };
+
+ struct obdo ioc_obdo1;
+ struct obdo ioc_obdo2;
+
+ u64 ioc_count;
+ u64 ioc_offset;
+ __u32 ioc_dev;
+ __u32 ioc_command;
+
+ __u64 ioc_nid;
+ __u32 ioc_nal;
+ __u32 ioc_type;
+
+ /* buffers the kernel will treat as user pointers */
+ __u32 ioc_plen1;
+ char *ioc_pbuf1;
+ __u32 ioc_plen2;
+ char *ioc_pbuf2;
+
+ /* inline buffers for various arguments */
+ __u32 ioc_inllen1;
+ char *ioc_inlbuf1;
+ __u32 ioc_inllen2;
+ char *ioc_inlbuf2;
+ __u32 ioc_inllen3;
+ char *ioc_inlbuf3;
+ __u32 ioc_inllen4;
+ char *ioc_inlbuf4;
+
+ char ioc_bulk[0];
+};
+
+struct obd_ioctl_hdr {
+ __u32 ioc_len;
+ __u32 ioc_version;
+};
+
+static inline int obd_ioctl_packlen(struct obd_ioctl_data *data)
+{
+ int len = cfs_size_round(sizeof(struct obd_ioctl_data));
+ len += cfs_size_round(data->ioc_inllen1);
+ len += cfs_size_round(data->ioc_inllen2);
+ len += cfs_size_round(data->ioc_inllen3);
+ len += cfs_size_round(data->ioc_inllen4);
+ return len;
+}
+
+
+static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
+{
+ if (data->ioc_len > OBD_MAX_IOCTL_BUFFER) {
+ CERROR("OBD ioctl: ioc_len larger than %d\n",
+ OBD_MAX_IOCTL_BUFFER);
+ return 1;
+ }
+ if (data->ioc_inllen1 > OBD_MAX_IOCTL_BUFFER) {
+ CERROR("OBD ioctl: ioc_inllen1 larger than ioc_len\n");
+ return 1;
+ }
+ if (data->ioc_inllen2 > OBD_MAX_IOCTL_BUFFER) {
+ CERROR("OBD ioctl: ioc_inllen2 larger than ioc_len\n");
+ return 1;
+ }
+ if (data->ioc_inllen3 > OBD_MAX_IOCTL_BUFFER) {
+ CERROR("OBD ioctl: ioc_inllen3 larger than ioc_len\n");
+ return 1;
+ }
+ if (data->ioc_inllen4 > OBD_MAX_IOCTL_BUFFER) {
+ CERROR("OBD ioctl: ioc_inllen4 larger than ioc_len\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+ CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+ CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf3 && !data->ioc_inllen3) {
+ CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_inlbuf4 && !data->ioc_inllen4) {
+ CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_pbuf1 && !data->ioc_plen1) {
+ CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_pbuf2 && !data->ioc_plen2) {
+ CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+ return 1;
+ }
+ if (data->ioc_plen1 && !data->ioc_pbuf1) {
+ CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+ return 1;
+ }
+ if (data->ioc_plen2 && !data->ioc_pbuf2) {
+ CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+ return 1;
+ }
+ if (obd_ioctl_packlen(data) > data->ioc_len) {
+ CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+ obd_ioctl_packlen(data), data->ioc_len);
+ return 1;
+ }
+ return 0;
+}
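+
+/*
+ * Packing sketch (hypothetical caller and "name" buffer; shown only to
+ * illustrate how ioc_len relates to obd_ioctl_packlen()):
+ *
+ *	struct obd_ioctl_data data = { 0 };
+ *
+ *	data.ioc_version = OBD_IOCTL_VERSION;
+ *	data.ioc_inlbuf1 = name;
+ *	data.ioc_inllen1 = strlen(name) + 1;
+ *	data.ioc_len	 = obd_ioctl_packlen(&data);
+ *	if (obd_ioctl_is_invalid(&data))
+ *		return -EINVAL;
+ */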
+
+
+#include "obd_support.h"
+
+/* function defined in lustre/obdclass/<platform>/<platform>-module.c */
+int obd_ioctl_getdata(char **buf, int *len, void *arg);
+int obd_ioctl_popdata(void *arg, void *data, int len);
+
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+ OBD_FREE_LARGE(buf, len);
+ return;
+}
+
+/*
+ * BSD ioctl description:
+ * #define IOC_V1 _IOR(g, n1, long)
+ * #define IOC_V2 _IOW(g, n2, long)
+ *
+ * ioctl(f, IOC_V1, arg);
+ * arg will be treated as a long value,
+ *
+ * ioctl(f, IOC_V2, arg)
+ * arg will be treated as a pointer, bsd will call
+ * copyin(buf, arg, sizeof(long))
+ *
+ * To make BSD ioctl handle the argument correctly and simply,
+ * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data
+ * for us. Does this change affect Linux? (XXX Liang)
+ */
+#define OBD_IOC_DATA_TYPE long
+
+#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DESTROY _IOW ('f', 104, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_SETATTR _IOW ('f', 107, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+
+
+#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SYNC _IOW ('f', 114, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME])
+#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME
+
+#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET _IOW ('f', 136, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 )
+#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SET_READONLY _IOW ('f', 141, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE)
+
+#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CHANGELOG_SEND _IOW ('f', 148, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE)
+/* see also <lustre/lustre_user.h> for ioctls 151-153 */
+/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */
+#define OBD_IOC_LOV_SETSTRIPE _IOW ('f', 154, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */
+#define OBD_IOC_LOV_GETSTRIPE _IOW ('f', 155, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */
+#define OBD_IOC_LOV_SETEA _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+/* see <lustre/lustre_user.h> for ioctls 157-159 */
+/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */
+#define OBD_IOC_QUOTACHECK _IOW ('f', 160, int)
+/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */
+#define OBD_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *)
+/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */
+#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl)
+/* see also <lustre/lustre_user.h> for ioctls 163-176 */
+#define OBD_IOC_CHANGELOG_REG _IOW ('f', 177, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_DEREG _IOW ('f', 178, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_CLEAR _IOW ('f', 179, struct obd_ioctl_data)
+#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LLOG_CATINFO is deprecated */
+#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE)
+
+#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE)
+
+/* <lustre/lustre_user.h> defines ioctl number 218-219 */
+#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t)
+
+#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data)
+#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data)
+
+#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE)
+
+/* XXX _IOWR('f', 250, long) has been defined in
+ * libcfs/include/libcfs/libcfs_private.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+/* Special values for remove LOV EA from disk */
+#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \
+ offset == (typeof(offset))(-1))
+
+/* #define POISON_BULK 0 */
+
+/*
+ * l_wait_event is a flexible sleeping function, permitting simple caller
+ * configuration of interrupt and timeout sensitivity along with actions to
+ * be performed in the event of either exception.
+ *
+ * The first form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
+ * intr_handler, callback_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired. At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled. Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ * timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as previous case, but condition is checked once every
+ * 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take the
+ * wait-queue spin-lock before returning, and, hence, the following idiom is
+ * safe ONLY when the caller provides some external locking:
+ *
+ * Thread1 Thread2
+ *
+ * l_wait_event(&obj->wq, ....); (1)
+ *
+ * wake_up(&obj->wq): (2)
+ * spin_lock(&q->lock); (2.1)
+ * __wake_up_common(q, ...); (2.2)
+ * spin_unlock(&q->lock, flags); (2.3)
+ *
+ * OBD_FREE_PTR(obj); (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
+ *
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
+ *
+ */
+static inline int back_to_sleep(void *arg)
+{
+ return 0;
+}
+
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
+struct l_wait_info {
+ long lwi_timeout;
+ long lwi_interval;
+ int lwi_allow_intr;
+ int (*lwi_on_timeout)(void *);
+ void (*lwi_on_signal)(void *);
+ void *lwi_cb_data;
+};
+
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0, \
+ .lwi_allow_intr = 0 \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = interval, \
+ .lwi_allow_intr = 0 \
+})
+
+#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = time_cb, \
+ .lwi_on_signal = sig_cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0, \
+ .lwi_allow_intr = 0 \
+})
+
+#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = time_cb, \
+ .lwi_on_signal = sig_cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0, \
+ .lwi_allow_intr = 1 \
+})
+
+#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data)
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \
+ sigmask(SIGTERM) | sigmask(SIGQUIT) | \
+ sigmask(SIGALRM))
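+
+/*
+ * Concrete sketch of the first usage form described above (the wait queue,
+ * condition and callbacks are hypothetical, named here only for
+ * illustration):
+ *
+ *	struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+ *						  my_timeout_cb, my_intr_cb,
+ *						  my_data);
+ *	int rc = l_wait_event(my_waitq, my_condition(my_data), &lwi);
+ *
+ *	rc == 0          condition became true
+ *	rc == -ETIMEDOUT timeout expired and my_timeout_cb() returned TRUE
+ *	rc == -EINTR     a fatal signal arrived after the timeout expired
+ */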
+
+
+/*
+ * wait for @condition to become true, but no longer than timeout, specified
+ * by @info.
+ */
+#define __l_wait_event(wq, condition, info, ret, l_add_wait) \
+do { \
+ wait_queue_t __wait; \
+ long __timeout = info->lwi_timeout; \
+ sigset_t __blocked; \
+ int __allow_intr = info->lwi_allow_intr; \
+ \
+ ret = 0; \
+ if (condition) \
+ break; \
+ \
+ init_waitqueue_entry(&__wait, current); \
+ l_add_wait(&wq, &__wait); \
+ \
+ /* Block all signals (just the non-fatal ones if no timeout). */ \
+ if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \
+ __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \
+ else \
+ __blocked = cfs_block_sigsinv(0); \
+ \
+ for (;;) { \
+ unsigned __wstate; \
+ \
+ __wstate = info->lwi_on_signal != NULL && \
+ (__timeout == 0 || __allow_intr) ? \
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; \
+ \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ \
+ if (condition) \
+ break; \
+ \
+ if (__timeout == 0) { \
+ schedule(); \
+ } else { \
+ long interval = info->lwi_interval? \
+ min_t(long, \
+ info->lwi_interval,__timeout):\
+ __timeout; \
+ long remaining = schedule_timeout(interval);\
+ __timeout = cfs_time_sub(__timeout, \
+ cfs_time_sub(interval, remaining));\
+ if (__timeout == 0) { \
+ if (info->lwi_on_timeout == NULL || \
+ info->lwi_on_timeout(info->lwi_cb_data)) { \
+ ret = -ETIMEDOUT; \
+ break; \
+ } \
+ /* Take signals after the timeout expires. */ \
+ if (info->lwi_on_signal != NULL) \
+ (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\
+ } \
+ } \
+ \
+ if (condition) \
+ break; \
+ if (cfs_signal_pending()) { \
+ if (info->lwi_on_signal != NULL && \
+ (__timeout == 0 || __allow_intr)) { \
+ if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+ info->lwi_on_signal(info->lwi_cb_data);\
+ ret = -EINTR; \
+ break; \
+ } \
+ /* We have to do this here because some signals */ \
+ /* are not blockable - ie from strace(1). */ \
+ /* In these cases we want to schedule_timeout() */ \
+ /* again, because we don't want that to return */ \
+ /* -EINTR when the RPC actually succeeded. */ \
+ /* the recalc_sigpending() below will deliver the */ \
+ /* signal properly. */ \
+ cfs_clear_sigpending(); \
+ } \
+ } \
+ \
+ cfs_restore_sigs(__blocked); \
+ \
+ set_current_state(TASK_RUNNING); \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+
+
+#define l_wait_event(wq, condition, info) \
+({ \
+ int __ret; \
+ struct l_wait_info *__info = (info); \
+ \
+ __l_wait_event(wq, condition, __info, \
+ __ret, add_wait_queue); \
+ __ret; \
+})
+
+#define l_wait_event_exclusive(wq, condition, info) \
+({ \
+ int __ret; \
+ struct l_wait_info *__info = (info); \
+ \
+ __l_wait_event(wq, condition, __info, \
+ __ret, add_wait_queue_exclusive); \
+ __ret; \
+})
+
+#define l_wait_event_exclusive_head(wq, condition, info) \
+({ \
+ int __ret; \
+ struct l_wait_info *__info = (info); \
+ \
+ __l_wait_event(wq, condition, __info, \
+ __ret, add_wait_queue_exclusive_head); \
+ __ret; \
+})
+
+#define l_wait_condition(wq, condition) \
+({ \
+ struct l_wait_info lwi = { 0 }; \
+ l_wait_event(wq, condition, &lwi); \
+})
+
+#define l_wait_condition_exclusive(wq, condition) \
+({ \
+ struct l_wait_info lwi = { 0 }; \
+ l_wait_event_exclusive(wq, condition, &lwi); \
+})
+
+#define l_wait_condition_exclusive_head(wq, condition) \
+({ \
+ struct l_wait_info lwi = { 0 }; \
+ l_wait_event_exclusive_head(wq, condition, &lwi); \
+})
+
+#define LIBLUSTRE_CLIENT (0)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h
new file mode 100644
index 000000000..df557c22a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_lite.h
@@ -0,0 +1,150 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LL_H
+#define _LL_H
+
+/** \defgroup lite lite
+ *
+ * @{
+ */
+
+#include "linux/lustre_lite.h"
+
+#include "obd_class.h"
+#include "lustre_net.h"
+#include "lustre_mds.h"
+#include "lustre_ha.h"
+
+/* 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS (22)
+#define LL_MAX_BLKSIZE (1UL<<LL_MAX_BLKSIZE_BITS)
+
+#include "lustre/lustre_user.h"
+
+
+struct lustre_rw_params {
+ int lrp_lock_mode;
+ ldlm_policy_data_t lrp_policy;
+ u32 lrp_brw_flags;
+ int lrp_ast_flags;
+};
+
+/*
+ * XXX nikita: this function lives in the header because it is used by both
+ * the llite kernel module and the liblustre library, and there is no (?)
+ * better place to put it.
+ */
+static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
+ __u64 connect_flags,
+ loff_t pos, ssize_t len,
+ struct lustre_rw_params *params)
+{
+ params->lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW;
+ params->lrp_brw_flags = 0;
+
+ params->lrp_policy.l_extent.start = pos;
+ params->lrp_policy.l_extent.end = pos + len - 1;
+ /*
+ * for now O_APPEND always takes local locks.
+ */
+ if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) {
+ params->lrp_policy.l_extent.start = 0;
+ params->lrp_policy.l_extent.end = OBD_OBJECT_EOF;
+ } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) {
+ /*
+ * liblustre: OST-side locking for all non-O_APPEND
+ * reads/writes.
+ */
+ params->lrp_lock_mode = LCK_NL;
+ params->lrp_brw_flags = OBD_BRW_SRVLOCK;
+ } else {
+ /*
+ * nothing special for the kernel. In the future llite may use
+ * OST-side locks for small writes into highly contended
+ * files.
+ */
+ }
+ params->lrp_ast_flags = (open_flags & O_NONBLOCK) ?
+ LDLM_FL_BLOCK_NOWAIT : 0;
+}
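+
+/*
+ * Illustrative call (the file, connect flags and byte range shown here are
+ * hypothetical): build extent-lock parameters for a plain write.
+ *
+ *	struct lustre_rw_params p;
+ *
+ *	lustre_build_lock_params(OBD_BRW_WRITE, file->f_flags,
+ *				 connect_flags, pos, count, &p);
+ */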
+
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * connect flags (capabilities) supported by all imports a given mount is
+ * connected to.
+ */
+struct lustre_client_ocd {
+ /*
+ * This is conjunction of connect_flags across all imports (LOVs) this
+ * mount is connected to. This field is updated by cl_ocd_update()
+ * under ->lco_lock.
+ */
+ __u64 lco_flags;
+ struct mutex lco_lock;
+ struct obd_export *lco_md_exp;
+ struct obd_export *lco_dt_exp;
+};
+
+/*
+ * Chain of hash overflow pages.
+ */
+struct ll_dir_chain {
+ /* XXX something. Later */
+};
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+ if (BITS_PER_LONG == 32 && hash64)
+ hash >>= 32;
+ /* fold hash 0 onto the index used for hash 1; otherwise it would be
+ * saved at page index end (~0UL), which causes
+ * truncate_inode_pages_range() to loop forever.
+ */
+ return ~0UL - (hash + !hash);
+}
+
+/** @} lite */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h
new file mode 100644
index 000000000..2187fb615
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_log.h
@@ -0,0 +1,545 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LUSTRE_LOG_H
+#define _LUSTRE_LOG_H
+
+/** \defgroup log log
+ *
+ * @{
+ */
+
+#include "obd_class.h"
+#include "lustre/lustre_idl.h"
+#include "dt_object.h"
+
+#define LOG_NAME_LIMIT(logname, name) \
+ snprintf(logname, sizeof(logname), "LOGS/%s", name)
+#define LLOG_EEMPTY 4711
+
+enum llog_open_param {
+ LLOG_OPEN_EXISTS = 0x0000,
+ LLOG_OPEN_NEW = 0x0001,
+};
+
+struct plain_handle_data {
+ struct list_head phd_entry;
+ struct llog_handle *phd_cat_handle;
+ struct llog_cookie phd_cookie; /* cookie of this log in its cat */
+};
+
+struct cat_handle_data {
+ struct list_head chd_head;
+ struct llog_handle *chd_current_log; /* currently open log */
+ struct llog_handle *chd_next_log; /* llog to be used next */
+};
+
+static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid)
+{
+ /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS)
+ * logid's by non-zero ogen (inode generation) and convert them
+ * into IGIF */
+ if (id->lgl_ogen == 0) {
+ fid->f_seq = id->lgl_oi.oi.oi_seq;
+ fid->f_oid = id->lgl_oi.oi.oi_id;
+ fid->f_ver = 0;
+ } else {
+ lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen);
+ }
+}
+
+static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id)
+{
+ id->lgl_oi.oi.oi_seq = fid->f_seq;
+ id->lgl_oi.oi.oi_id = fid->f_oid;
+ id->lgl_ogen = 0;
+}
+
+static inline void logid_set_id(struct llog_logid *log_id, __u64 id)
+{
+ log_id->lgl_oi.oi.oi_id = id;
+}
+
+static inline __u64 logid_id(struct llog_logid *log_id)
+{
+ return log_id->lgl_oi.oi.oi_id;
+}
+
+struct llog_handle;
+
+/* llog.c - general API */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+ int flags, struct obd_uuid *uuid);
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+ struct llog_rec_hdr *rec, void *data);
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata);
+int llog_process_or_fork(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_reverse_process(const struct lu_env *env,
+ struct llog_handle *loghandle, llog_cb_t cb,
+ void *data, void *catdata);
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+ int index);
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_handle **lgh, struct llog_logid *logid,
+ char *name, enum llog_open_param open_param);
+int llog_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name);
+int llog_backup(const struct lu_env *env, struct obd_device *obd,
+ struct llog_ctxt *ctxt, struct llog_ctxt *bak_ctxt,
+ char *name, char *backup);
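+
+/*
+ * Typical read-side flow (sketch only; the context, callback, flag and log
+ * name are illustrative): open by name, initialise the handle, walk the
+ * records, close.
+ *
+ *	rc = llog_open(env, ctxt, &llh, NULL, "my_log", LLOG_OPEN_EXISTS);
+ *	if (rc == 0) {
+ *		rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+ *		if (rc == 0)
+ *			rc = llog_process(env, llh, my_rec_cb, my_data, NULL);
+ *		llog_close(env, llh);
+ *	}
+ */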
+
+/* llog_process flags */
+#define LLOG_FLAG_NODEAMON 0x0001
+
+/* llog_cat.c - catalog api */
+struct llog_process_data {
+ /**
+ * Any useful data needed while processing catalog. This is
+ * passed later to process callback.
+ */
+ void *lpd_data;
+ /**
+ * Catalog process callback function, called for each record
+ * in catalog.
+ */
+ llog_cb_t lpd_cb;
+ /**
+ * Start processing the catalog from startcat/startidx
+ */
+ int lpd_startcat;
+ int lpd_startidx;
+};
+
+struct llog_process_cat_data {
+ /**
+ * Temporary stored first_idx while scanning log.
+ */
+ int lpcd_first_idx;
+ /**
+ * Temporary stored last_idx while scanning log.
+ */
+ int lpcd_last_idx;
+};
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf, struct thandle *th);
+int llog_cat_declare_add_rec(const struct lu_env *env,
+ struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct thandle *th);
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf);
+int llog_cat_cancel_records(const struct lu_env *env,
+ struct llog_handle *cathandle, int count,
+ struct llog_cookie *cookies);
+int llog_cat_process_or_fork(const struct lu_env *env,
+ struct llog_handle *cat_llh, llog_cb_t cb,
+ void *data, int startcat, int startidx, bool fork);
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+ llog_cb_t cb, void *data, int startcat, int startidx);
+int llog_cat_reverse_process(const struct lu_env *env,
+ struct llog_handle *cat_llh, llog_cb_t cb,
+ void *data);
+int llog_cat_init_and_process(const struct lu_env *env,
+ struct llog_handle *llh);
+
+/* llog_obd.c */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+ struct obd_llog_group *olg, int index,
+ struct obd_device *disk_obd, struct llog_operations *op);
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt);
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *);
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags);
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_cookie *cookies, int flags);
+
+/* llog_net.c */
+int llog_initiator_connect(struct llog_ctxt *ctxt);
+
+struct llog_operations {
+ int (*lop_destroy)(const struct lu_env *env,
+ struct llog_handle *handle);
+ int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
+ int *curr_idx, int next_idx, __u64 *offset,
+ void *buf, int len);
+ int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
+ int prev_idx, void *buf, int len);
+ int (*lop_read_header)(const struct lu_env *env,
+ struct llog_handle *handle);
+ int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
+ struct obd_llog_group *olg, int ctxt_idx,
+ struct obd_device *disk_obd);
+ int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
+ int flags);
+ int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
+ int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_cookie *cookies, int flags);
+ int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
+ struct llog_gen *gen, struct obd_uuid *uuid);
+ /**
+ * Any llog file must be opened first using llog_open(). A llog can be
+ * opened by name, by logid, or with neither; in the last case a new
+ * logid will be generated.
+ */
+ int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_logid *logid, char *name,
+ enum llog_open_param);
+ /**
+ * An opened llog may not exist; this must be checked where needed using
+ * the llog_exist() call.
+ */
+ int (*lop_exist)(struct llog_handle *lgh);
+ /**
+ * Close the llog file; llog_free_handle() is called implicitly.
+ * Any opened llog must be closed with a llog_close() call.
+ */
+ int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
+ /**
+ * Create a new llog file. The llog must already be opened.
+ * Must be used only for local llog operations.
+ */
+ int (*lop_declare_create)(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct thandle *th);
+ int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
+ struct thandle *th);
+ /**
+ * Write a new record to the llog. It usually appends records but can
+ * also edit existing ones.
+ */
+ int (*lop_declare_write_rec)(const struct lu_env *env,
+ struct llog_handle *lgh,
+ struct llog_rec_hdr *rec,
+ int idx, struct thandle *th);
+ int (*lop_write_rec)(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec,
+ struct llog_cookie *cookie, int cookiecount,
+ void *buf, int idx, struct thandle *th);
+ /**
+ * Add a new record to the llog catalog. Does the same as
+ * llog_write_rec() but via the llog catalog.
+ */
+ int (*lop_declare_add)(const struct lu_env *env,
+ struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct thandle *th);
+ int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct llog_cookie *cookie,
+ void *buf, struct thandle *th);
+};
+
+/* In-memory descriptor for a log object or log catalog */
+struct llog_handle {
+ struct rw_semaphore lgh_lock;
+ spinlock_t lgh_hdr_lock; /* protect lgh_hdr data */
+ struct llog_logid lgh_id; /* id of this log */
+ struct llog_log_hdr *lgh_hdr;
+ struct file *lgh_file;
+ struct dt_object *lgh_obj;
+ int lgh_last_idx;
+ int lgh_cur_idx; /* used during llog_process */
+ __u64 lgh_cur_offset; /* used during llog_process */
+ struct llog_ctxt *lgh_ctxt;
+ union {
+ struct plain_handle_data phd;
+ struct cat_handle_data chd;
+ } u;
+ char *lgh_name;
+ void *private_data;
+ struct llog_operations *lgh_logops;
+ atomic_t lgh_refcount;
+};
+
+#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001
+#define LLOG_CTXT_FLAG_STOP 0x00000002
+
+struct llog_ctxt {
+ int loc_idx; /* my index in the obd array of ctxts */
+ struct obd_device *loc_obd; /* points back to the containing obd*/
+ struct obd_llog_group *loc_olg; /* group containing that ctxt */
+ struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */
+ struct obd_import *loc_imp; /* to use in RPC's: can be backward
+ pointing import */
+ struct llog_operations *loc_logops;
+ struct llog_handle *loc_handle;
+ struct mutex loc_mutex; /* protect loc_imp */
+ atomic_t loc_refcount;
+ long loc_flags; /* flags, see above defines */
+ struct dt_object *loc_dir;
+};
+
+#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
+
+static inline int llog_obd2ops(struct llog_ctxt *ctxt,
+ struct llog_operations **lop)
+{
+ if (ctxt == NULL)
+ return -ENOTCONN;
+
+ *lop = ctxt->loc_logops;
+ if (*lop == NULL)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static inline int llog_handle2ops(struct llog_handle *loghandle,
+ struct llog_operations **lop)
+{
+ if (loghandle == NULL || loghandle->lgh_logops == NULL)
+ return -EINVAL;
+
+ *lop = loghandle->lgh_logops;
+ return 0;
+}
+
+static inline int llog_data_len(int len)
+{
+ return cfs_size_round(len);
+}
+
+static inline int llog_get_size(struct llog_handle *loghandle)
+{
+ if (loghandle && loghandle->lgh_hdr)
+ return loghandle->lgh_hdr->llh_count;
+ return 0;
+}
+
+static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt)
+{
+ atomic_inc(&ctxt->loc_refcount);
+ CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt,
+ atomic_read(&ctxt->loc_refcount));
+ return ctxt;
+}
+
+static inline void llog_ctxt_put(struct llog_ctxt *ctxt)
+{
+ if (ctxt == NULL)
+ return;
+ LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON);
+ CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
+ atomic_read(&ctxt->loc_refcount) - 1);
+ __llog_ctxt_put(NULL, ctxt);
+}
+
+static inline void llog_group_init(struct obd_llog_group *olg, int group)
+{
+ init_waitqueue_head(&olg->olg_waitq);
+ spin_lock_init(&olg->olg_lock);
+ mutex_init(&olg->olg_cat_processing);
+ olg->olg_seq = group;
+}
+
+static inline int llog_group_set_ctxt(struct obd_llog_group *olg,
+ struct llog_ctxt *ctxt, int index)
+{
+ LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+ spin_lock(&olg->olg_lock);
+ if (olg->olg_ctxts[index] != NULL) {
+ spin_unlock(&olg->olg_lock);
+ return -EEXIST;
+ }
+ olg->olg_ctxts[index] = ctxt;
+ spin_unlock(&olg->olg_lock);
+ return 0;
+}
+
+static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg,
+ int index)
+{
+ struct llog_ctxt *ctxt;
+
+ LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+ spin_lock(&olg->olg_lock);
+ if (olg->olg_ctxts[index] == NULL)
+ ctxt = NULL;
+ else
+ ctxt = llog_ctxt_get(olg->olg_ctxts[index]);
+ spin_unlock(&olg->olg_lock);
+ return ctxt;
+}
+
+static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index)
+{
+ LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+ spin_lock(&olg->olg_lock);
+ olg->olg_ctxts[index] = NULL;
+ spin_unlock(&olg->olg_lock);
+}
+
+static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
+ int index)
+{
+ return llog_group_get_ctxt(&obd->obd_olg, index);
+}
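+
+/*
+ * Typical reference pattern (sketch; the index used is just an example):
+ * llog_get_context() takes a reference on success, which must be dropped
+ * with llog_ctxt_put() when done.
+ *
+ *	struct llog_ctxt *ctxt;
+ *
+ *	ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+ *	if (ctxt != NULL) {
+ *		...
+ *		llog_ctxt_put(ctxt);
+ *	}
+ */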
+
+static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index)
+{
+ return (olg->olg_ctxts[index] == NULL);
+}
+
+static inline int llog_ctxt_null(struct obd_device *obd, int index)
+{
+ return llog_group_ctxt_null(&obd->obd_olg, index);
+}
+
+static inline int llog_destroy(const struct lu_env *env,
+ struct llog_handle *handle)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_handle2ops(handle, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_destroy == NULL)
+ return -EOPNOTSUPP;
+
+ rc = lop->lop_destroy(env, handle);
+ return rc;
+}
+
+static inline int llog_next_block(const struct lu_env *env,
+ struct llog_handle *loghandle, int *cur_idx,
+ int next_idx, __u64 *cur_offset, void *buf,
+ int len)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_next_block == NULL)
+ return -EOPNOTSUPP;
+
+ rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx,
+ cur_offset, buf, len);
+ return rc;
+}
+
+static inline int llog_prev_block(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_prev_block == NULL)
+ return -EOPNOTSUPP;
+
+ rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len);
+ return rc;
+}
+
+static inline int llog_connect(struct llog_ctxt *ctxt,
+ struct llog_logid *logid, struct llog_gen *gen,
+ struct obd_uuid *uuid)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_obd2ops(ctxt, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_connect == NULL)
+ return -EOPNOTSUPP;
+
+ rc = lop->lop_connect(ctxt, logid, gen, uuid);
+ return rc;
+}
+
+/* llog.c */
+int llog_exist(struct llog_handle *loghandle);
+int llog_declare_create(const struct lu_env *env,
+ struct llog_handle *loghandle, struct thandle *th);
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+ struct thandle *th);
+int llog_declare_write_rec(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, int idx,
+ struct thandle *th);
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+ struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+ int numcookies, void *buf, int idx, struct thandle *th);
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+ void *buf, struct thandle *th);
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct thandle *th);
+int lustre_process_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg);
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_handle **res, struct llog_logid *logid,
+ char *name);
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_logid *logid, char *name);
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ int cookiecount, void *buf, int idx);
+
+/** @} log */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
new file mode 100644
index 000000000..b1b05c8a3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h
@@ -0,0 +1,191 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mdc.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDC_H
+#define _LUSTRE_MDC_H
+
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include "lustre_intent.h"
+#include "lustre_handles.h"
+#include "../../include/linux/libcfs/libcfs.h"
+#include "obd_class.h"
+#include "lustre/lustre_idl.h"
+#include "lustre_lib.h"
+#include "lustre_dlm.h"
+#include "lustre_export.h"
+
+struct ptlrpc_client;
+struct obd_export;
+struct ptlrpc_request;
+struct obd_device;
+
+struct mdc_rpc_lock {
+ struct mutex rpcl_mutex;
+ struct lookup_intent *rpcl_it;
+ int rpcl_fakes;
+};
+
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
+static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
+{
+ mutex_init(&lck->rpcl_mutex);
+ lck->rpcl_it = NULL;
+}
+
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
+ struct lookup_intent *it)
+{
+ if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
+ it->it_op == IT_LAYOUT))
+ return;
+
+ /* This would normally block until the existing request finishes.
+ * If fail_loc is set it will block until the regular request is
+ * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set
+ * it will only be cleared when all fake requests are finished.
+ * Only when all fake requests are finished can normal requests
+ * be sent, to ensure they are recoverable again. */
+ again:
+ mutex_lock(&lck->rpcl_mutex);
+
+ if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+ lck->rpcl_it = MDC_FAKE_RPCL_IT;
+ lck->rpcl_fakes++;
+ mutex_unlock(&lck->rpcl_mutex);
+ return;
+ }
+
+ /* This will only happen when the CFS_FAIL_CHECK() was
+ * just turned off but there are still requests in progress.
+ * Wait until they finish. It doesn't need to be efficient
+ * in this extremely rare case, just have low overhead in
+ * the common case when it isn't true. */
+ while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+ mutex_unlock(&lck->rpcl_mutex);
+ schedule_timeout(cfs_time_seconds(1) / 4);
+ goto again;
+ }
+
+ LASSERT(lck->rpcl_it == NULL);
+ lck->rpcl_it = it;
+}
+
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
+ struct lookup_intent *it)
+{
+ if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
+ it->it_op == IT_LAYOUT))
+ return;
+
+ if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */
+ mutex_lock(&lck->rpcl_mutex);
+
+ LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+ lck->rpcl_fakes--;
+
+ if (lck->rpcl_fakes == 0)
+ lck->rpcl_it = NULL;
+
+ } else {
+ LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
+ lck->rpcl_it = NULL;
+ }
+
+ mutex_unlock(&lck->rpcl_mutex);
+}
+
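+/*
+ * Usage sketch (illustrative only): callers take the lock before issuing a
+ * modifying metadata RPC and release it with the same intent once the RPC
+ * completes, e.g. around a synchronous request helper:
+ *
+ *	mdc_get_rpc_lock(rpc_lock, it);
+ *	rc = ptlrpc_queue_wait(req);
+ *	mdc_put_rpc_lock(rpc_lock, it);
+ */
+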
+/* Update the maximum observed easize and cookiesize. The default easize
+ * and cookiesize are initialized to the minimum value but allowed to grow
+ * up to a single page in size if required to handle the common case.
+ */
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+ struct mdt_body *body)
+{
+ if (body->valid & OBD_MD_FLMODEASIZE) {
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+
+ if (cli->cl_max_mds_easize < body->max_mdsize) {
+ cli->cl_max_mds_easize = body->max_mdsize;
+ cli->cl_default_mds_easize =
+ min_t(__u32, body->max_mdsize, PAGE_CACHE_SIZE);
+ }
+ if (cli->cl_max_mds_cookiesize < body->max_cookiesize) {
+ cli->cl_max_mds_cookiesize = body->max_cookiesize;
+ cli->cl_default_mds_cookiesize =
+ min_t(__u32, body->max_cookiesize, PAGE_CACHE_SIZE);
+ }
+ }
+}
+
+
+struct mdc_cache_waiter {
+ struct list_head mcw_entry;
+ wait_queue_head_t mcw_waitq;
+};
+
+/* mdc/mdc_locks.c */
+int it_disposition(struct lookup_intent *it, int flag);
+void it_clear_disposition(struct lookup_intent *it, int flag);
+void it_set_disposition(struct lookup_intent *it, int flag);
+int it_open_error(int phase, struct lookup_intent *it);
+
+static inline bool cl_is_lov_delay_create(unsigned int flags)
+{
+ return (flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE;
+}
+
+static inline void cl_lov_delay_create_clear(unsigned int *flags)
+{
+ if ((*flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE)
+ *flags &= ~O_LOV_DELAY_CREATE;
+}
+
+/** @} mdc */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h
new file mode 100644
index 000000000..f0cce41c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_mds.h
@@ -0,0 +1,81 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDS_H
+#define _LUSTRE_MDS_H
+
+/** \defgroup mds mds
+ *
+ * @{
+ */
+
+#include "lustre_handles.h"
+#include "../../include/linux/libcfs/libcfs.h"
+#include "lustre/lustre_idl.h"
+#include "lustre_lib.h"
+#include "lustre_dlm.h"
+#include "lustre_export.h"
+
+struct mds_group_info {
+ struct obd_uuid *uuid;
+ int group;
+};
+
+struct mds_capa_info {
+ struct obd_uuid *uuid;
+ struct lustre_capa_key *capa;
+};
+
+#define MDD_OBD_NAME "mdd_obd"
+#define MDD_OBD_UUID "mdd_obd_uuid"
+
+static inline int md_should_create(__u64 flags)
+{
+ return !(flags & MDS_OPEN_DELAY_CREATE ||
+ !(flags & FMODE_WRITE));
+}
+
+/* these are local flags, used only on the client, private */
+#define M_CHECK_STALE 0200000000
+
+/** @} mds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
new file mode 100644
index 000000000..e2805bd1a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -0,0 +1,2967 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup PtlRPC Portal RPC and networking module.
+ *
+ * PortalRPC is the layer used by the rest of the Lustre code to achieve
+ * network communications: establish connections with corresponding export
+ * and import states, listen for a service, send and receive RPCs.
+ * PortalRPC also includes the base recovery framework: packet resending and
+ * replaying, reconnections, and the pinger.
+ *
+ * PortalRPC utilizes LNet as its transport layer.
+ *
+ * @{
+ */
+
+
+#ifndef _LUSTRE_NET_H
+#define _LUSTRE_NET_H
+
+/** \defgroup net net
+ *
+ * @{
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+// #include <obd.h>
+#include "../../include/linux/lnet/lnet.h"
+#include "lustre/lustre_idl.h"
+#include "lustre_ha.h"
+#include "lustre_sec.h"
+#include "lustre_import.h"
+#include "lprocfs_status.h"
+#include "lu_object.h"
+#include "lustre_req_layout.h"
+
+#include "obd_support.h"
+#include "lustre_ver.h"
+
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS 0
+
+/**
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value. The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS 2
+#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all. Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future. Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS)
+
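+/*
+ * Worked example (assuming the usual LNET_MTU of 1 MiB, i.e. LNET_MTU_BITS
+ * == 20, and 4 KiB pages, i.e. PAGE_CACHE_SHIFT == 12):
+ *
+ *	PTLRPC_MAX_BRW_BITS  = 20 + 2  = 22
+ *	PTLRPC_MAX_BRW_SIZE  = 1 << 22 = 4 MiB
+ *	PTLRPC_MAX_BRW_PAGES = 4 MiB >> 12 = 1024 pages
+ */
+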
+/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
+# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+# error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE))
+# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
+# error "PTLRPC_MAX_BRW_SIZE too big"
+# endif
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
+# error "PTLRPC_MAX_BRW_PAGES too big"
+# endif
+
+#define PTLRPC_NTHRS_INIT 2
+
+/**
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
+ *
+ * ?_NBUFS # buffers to allocate when growing the pool
+ * ?_BUFSIZE # bytes in a single request buffer
+ * ?_MAXREQSIZE # maximum request service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT # threads to create for each service partition at
+ * initialization. For a non-affinity service with
+ * only one partition, it is the overall number of
+ * threads for the service while initializing.
+ * ?_NTHRS_BASE # threads that should be created at minimum for
+ * each ptlrpc partition to keep the service healthy.
+ * It is the low-water mark of the per-partition
+ * thread upper limit.
+ * ?_THR_FACTOR # threads that can be added to the thread upper
+ * limit for each CPU core. This factor is only a
+ * reference; the value may be decreased if the
+ * number of cores per CPT is above a limit.
+ * ?_NTHRS_MAX # overall threads that can be created for a
+ * service. It is a soft limit: if the service runs
+ * on a machine with hundreds of cores and tens of
+ * CPU partitions, we still need to guarantee that
+ * each partition has ?_NTHRS_BASE threads, so the
+ * total thread count of ?_NTHRS_BASE * number_of_cpts
+ * can exceed ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT 2
+ * #define MDS_NTHRS_BASE 64
+ * #define MDS_NTHRS_FACTOR 8
+ * #define MDS_NTHRS_MAX 1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores, user configured it to 4 partitions so each
+ * partition has 4 cores, then actual number of service threads on each
+ * partition is:
+ * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ * 96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores, user configured it to 4 partitions so each
+ * partition has 8 cores, then actual number of service threads on each
+ * partition is:
+ * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ * 128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(B) has 96 cores, user configured it to 8 partitions so each
+ * partition has 12 cores, then actual number of service threads on each
+ * partition is:
+ * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ * 160 * partitions(8) = 1280
+ *
+ * However, that is above the soft limit MDS_NTHRS_MAX, so we instead use it
+ * to derive the upper limit of threads for each partition:
+ * MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(C) has a thousand cores and the user configured it to 32 partitions:
+ * MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above the soft limit MDS_NTHRS_MAX(1024), but we still
+ * need to guarantee that each partition has at least MDS_NTHRS_BASE(64)
+ * threads to keep the service healthy, so the total number of threads will
+ * just be 2048.
+ *
+ * NB: we don't suggest choosing a server with that many cores, because the
+ * backend filesystem itself, the buffer cache, or the underlying network
+ * stack might have SMP scalability issues at that scale.
+ *
+ * If the user already has a fat machine with hundreds or thousands of cores,
+ * there are two choices for configuration:
+ * a) create a CPU table from a subset of all CPUs and run Lustre on
+ * top of this subset
+ * b) bind service threads to a few partitions; see the module parameters
+ * of MDS and OSS for details
+ *
+ * NB: these calculations (and the examples below) are simplified to aid
+ * understanding; the real implementation is a little more complex, please
+ * see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
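+/*
+ * Minimal sketch of the sizing rule described above (illustrative only; the
+ * real logic lives in ptlrpc_server_nthreads_check(), and
+ * example_nthrs_per_cpt is a hypothetical helper, not part of this header):
+ *
+ *	static unsigned int example_nthrs_per_cpt(unsigned int base,
+ *						  unsigned int factor,
+ *						  unsigned int nthrs_max,
+ *						  unsigned int ncpts,
+ *						  unsigned int cores_per_cpt)
+ *	{
+ *		unsigned int nthrs = base + cores_per_cpt * factor;
+ *
+ *		if (nthrs * ncpts > nthrs_max) {
+ *			nthrs = nthrs_max / ncpts;
+ *			if (nthrs < base)
+ *				nthrs = base;
+ *		}
+ *		return nthrs;
+ *	}
+ *
+ * With the MDS constants above this reproduces examples 1) to 4): 96, 128,
+ * 128 and 64 threads per partition respectively.
+ */
+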
+ /*
+ * LDLM threads constants:
+ *
+ * Given 8 as factor and 24 as base threads number
+ *
+ * example 1)
+ * On a 4-core machine we will have 24 + 8 * 4 = 56 threads.
+ *
+ * example 2)
+ * On an 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+ * threads for each partition, and the total number of threads will be 112.
+ *
+ * example 3)
+ * On a 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+ * threads for each partition to keep the service healthy, so the total
+ * number of threads should be 24 * 8 = 192.
+ *
+ * So with these constants, the number of threads will be at a similar level
+ * to old versions, unless the target machine has over a hundred cores
+ */
+#define LDLM_THR_FACTOR 8
+#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE 24
+#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE (8 * 1024)
+#define LDLM_MAXREQSIZE (5 * 1024)
+#define LDLM_MAXREPSIZE (1024)
+
+#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */
+
+#define OST_MAXREQSIZE (5 * 1024)
+
+/* Macro to hide a typecast. */
+#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args)
+
+/**
+ * Structure to define a single portal connection.
+ */
+struct ptlrpc_connection {
+ /** linkage for connections hash table */
+ struct hlist_node c_hash;
+ /** Our own lnet nid for this connection */
+ lnet_nid_t c_self;
+ /** Remote side nid for this connection */
+ lnet_process_id_t c_peer;
+ /** UUID of the other side */
+ struct obd_uuid c_remote_uuid;
+ /** reference counter for this connection */
+ atomic_t c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+ /** What lnet portal does this client send messages to by default */
+ __u32 cli_request_portal;
+ /** What portal do we expect replies on */
+ __u32 cli_reply_portal;
+ /** Name of the client */
+ char *cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+ /**
+ * Scratchpad for passing args to completion interpreter. Users
+ * cast to the struct of their choosing, and CLASSERT that this is
+ * big enough. For _tons_ of context, OBD_ALLOC a struct and store
+ * a pointer to it here. The pointer_arg ensures this struct is at
+ * least big enough for that.
+ */
+ void *pointer_arg[11];
+ __u64 space[7];
+};
+
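+/*
+ * Usage sketch for the scratchpad above (illustrative only; my_async_args
+ * is a hypothetical caller-defined struct):
+ *
+ *	struct my_async_args {
+ *		struct obd_export *aa_exp;
+ *		int		   aa_flags;
+ *	};
+ *
+ *	CLASSERT(sizeof(struct my_async_args) <=
+ *		 sizeof(union ptlrpc_async_args));
+ *	struct my_async_args *aa = ptlrpc_req_async_args(req);
+ *	aa->aa_exp = exp;
+ */
+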
+struct ptlrpc_request_set;
+typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of request set structure.
+ * A request set is a list of requests (not necessarily to the same target)
+ * that, once populated with RPCs, can be sent in parallel.
+ * There are two kinds of request sets: general purpose, and those with a
+ * dedicated serving thread (the ptlrpcd set is an example of the latter).
+ * For general-purpose sets, once the set has started sending it is
+ * impossible to add new requests to it.
+ * Provides a way to call "completion callbacks" when all requests in the
+ * set have returned.
+ */
+struct ptlrpc_request_set {
+ atomic_t set_refcount;
+ /** number of in queue requests */
+ atomic_t set_new_count;
+ /** number of uncompleted requests */
+ atomic_t set_remaining;
+ /** wait queue to wait on for request events */
+ wait_queue_head_t set_waitq;
+ wait_queue_head_t *set_wakeup_ptr;
+ /** List of requests in the set */
+ struct list_head set_requests;
+ /**
+ * List of completion callbacks to be called when the set is completed
+ * This is only used if \a set_interpret is NULL.
+ * Links struct ptlrpc_set_cbdata.
+ */
+ struct list_head set_cblist;
+ /** Completion callback, if only one. */
+ set_interpreter_func set_interpret;
+ /** opaque argument passed to completion \a set_interpret callback. */
+ void *set_arg;
+ /**
+ * Lock for \a set_new_requests manipulations
+ * locked so that any old caller can communicate requests to
+ * the set holder who can then fold them into the lock-free set
+ */
+ spinlock_t set_new_req_lock;
+ /** List of new yet unsent requests. Only used with ptlrpcd now. */
+ struct list_head set_new_requests;
+
+ /** rq_status of requests that have been freed already */
+ int set_rc;
+ /** Additional fields used by the flow control extension */
+ /** Maximum number of RPCs in flight */
+ int set_max_inflight;
+ /** Callback function used to generate RPCs */
+ set_producer_func set_producer;
+ /** opaque argument passed to the producer callback */
+ void *set_producer_arg;
+};
+
+/**
+ * Description of a single ptlrpc_set callback
+ */
+struct ptlrpc_set_cbdata {
+ /** List linkage item */
+ struct list_head psc_item;
+ /** Pointer to interpreting function */
+ set_interpreter_func psc_interpret;
+ /** Opaque argument to pass to the callback */
+ void *psc_data;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+ void (*cbid_fn)(lnet_event_t *ev); /* specific callback fn */
+ void *cbid_arg; /* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG 0
+
+/**
+ * Structure to define reply state on the server
+ * Reply state holds various reply message information. Also for "difficult"
+ * replies (rep-ack case) we store the state after sending reply and wait
+ * for the client to acknowledge the reception. In these cases locks could be
+ * added to the state for replay/failover consistency guarantees.
+ */
+struct ptlrpc_reply_state {
+ /** Callback description */
+ struct ptlrpc_cb_id rs_cb_id;
+ /** Linkage for list of all reply states in a system */
+ struct list_head rs_list;
+ /** Linkage for list of all reply states on same export */
+ struct list_head rs_exp_list;
+ /** Linkage for list of all reply states for same obd */
+ struct list_head rs_obd_list;
+#if RS_DEBUG
+ struct list_head rs_debug_list;
+#endif
+ /** A spinlock to protect the reply state flags */
+ spinlock_t rs_lock;
+ /** Reply state flags */
+ unsigned long rs_difficult:1; /* ACK/commit stuff */
+ unsigned long rs_no_ack:1; /* no ACK, even for
+ difficult requests */
+ unsigned long rs_scheduled:1; /* being handled? */
+ unsigned long rs_scheduled_ever:1;/* any schedule attempts? */
+ unsigned long rs_handled:1; /* been handled yet? */
+ unsigned long rs_on_net:1; /* reply_out_callback pending? */
+ unsigned long rs_prealloc:1; /* rs from prealloc list */
+ unsigned long rs_committed:1;/* the transaction was committed
+ and the rs was dispatched
+ by ptlrpc_commit_replies */
+ /** Size of the state */
+ int rs_size;
+ /** opcode */
+ __u32 rs_opc;
+ /** Transaction number */
+ __u64 rs_transno;
+ /** xid */
+ __u64 rs_xid;
+ struct obd_export *rs_export;
+ struct ptlrpc_service_part *rs_svcpt;
+ /** Lnet metadata handle for the reply */
+ lnet_handle_md_t rs_md_h;
+ atomic_t rs_refcount;
+
+ /** Context for the service thread */
+ struct ptlrpc_svc_ctx *rs_svc_ctx;
+ /** Reply buffer (actually sent to the client), encoded if needed */
+ struct lustre_msg *rs_repbuf; /* wrapper */
+ /** Size of the reply buffer */
+ int rs_repbuf_len; /* wrapper buf length */
+ /** Size of the reply message */
+ int rs_repdata_len; /* wrapper msg length */
+ /**
+ * Actual reply message. Its content is encrypted (if needed) to
+ * produce reply buffer for actual sending. In simple case
+ * of no network encryption we just set \a rs_repbuf to \a rs_msg
+ */
+ struct lustre_msg *rs_msg; /* reply message */
+
+ /** Number of locks awaiting client ACK */
+ int rs_nlocks;
+ /** Handles of locks awaiting client reply ACK */
+ struct lustre_handle rs_locks[RS_MAX_LOCKS];
+ /** Lock modes of locks in \a rs_locks */
+ ldlm_mode_t rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages */
+enum rq_phase {
+ RQ_PHASE_NEW = 0xebc0de00,
+ RQ_PHASE_RPC = 0xebc0de01,
+ RQ_PHASE_BULK = 0xebc0de02,
+ RQ_PHASE_INTERPRET = 0xebc0de03,
+ RQ_PHASE_COMPLETE = 0xebc0de04,
+ RQ_PHASE_UNREGISTERING = 0xebc0de05,
+ RQ_PHASE_UNDEFINED = 0xebc0de06
+};
+
+/** Type of request interpreter call-back */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *arg, int rc);
+
+/**
+ * Definition of request pool structure.
+ * The pool is used to store empty preallocated requests for the case
+ * when we would actually need to send something without performing
+ * any allocations (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+ /** Locks the list */
+ spinlock_t prp_lock;
+ /** list of ptlrpc_request structs */
+ struct list_head prp_req_list;
+ /** Maximum message size that would fit into a request from this pool */
+ int prp_rq_size;
+ /** Function to allocate more requests for this pool */
+ void (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+/**
+ * \defgroup nrs Network Request Scheduler
+ * @{
+ */
+struct ptlrpc_nrs_policy;
+struct ptlrpc_nrs_resource;
+struct ptlrpc_nrs_request;
+
+/**
+ * NRS control operations.
+ *
+ * These are common for all policies.
+ */
+enum ptlrpc_nrs_ctl {
+ /**
+ * Not a valid opcode.
+ */
+ PTLRPC_NRS_CTL_INVALID,
+ /**
+ * Activate the policy.
+ */
+ PTLRPC_NRS_CTL_START,
+ /**
+ * Reserved for multiple primary policies, which may be a possibility
+ * in the future.
+ */
+ PTLRPC_NRS_CTL_STOP,
+ /**
+ * Policies can start using opcodes from this value and onwards for
+ * their own purposes; the assigned value itself is arbitrary.
+ */
+ PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
+};
+
+/**
+ * ORR policy operations
+ */
+enum nrs_ctl_orr {
+ NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+ NRS_CTL_ORR_WR_QUANTUM,
+ NRS_CTL_ORR_RD_OFF_TYPE,
+ NRS_CTL_ORR_WR_OFF_TYPE,
+ NRS_CTL_ORR_RD_SUPP_REQ,
+ NRS_CTL_ORR_WR_SUPP_REQ,
+};
+
+/**
+ * NRS policy operations.
+ *
+ * These determine the behaviour of a policy, and are called in response to
+ * NRS core events.
+ */
+struct ptlrpc_nrs_pol_ops {
+ /**
+ * Called during policy registration; this operation is optional.
+ *
+ * \param[in,out] policy The policy being initialized
+ */
+ int (*op_policy_init) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Called during policy unregistration; this operation is optional.
+ *
+ * \param[in,out] policy The policy being unregistered/finalized
+ */
+ void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Called when activating a policy via lprocfs; policies allocate and
+ * initialize their resources here; this operation is optional.
+ *
+ * \param[in,out] policy The policy being started
+ *
+ * \see nrs_policy_start_locked()
+ */
+ int (*op_policy_start) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Called when deactivating a policy via lprocfs; policies deallocate
+ * their resources here; this operation is optional
+ *
+ * \param[in,out] policy The policy being stopped
+ *
+ * \see nrs_policy_stop0()
+ */
+ void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+ /**
+ * Used for policy-specific operations; i.e. not generic ones like
+ * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+ * to an ioctl; this operation is optional.
+ *
+ * \param[in,out] policy The policy carrying out operation \a opc
+ * \param[in] opc The command operation being carried out
+ * \param[in,out] arg A generic buffer for communication between the
+ * user and the control operation
+ *
+ * \retval -ve error
+ * \retval 0 success
+ *
+ * \see ptlrpc_nrs_policy_control()
+ */
+ int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+ enum ptlrpc_nrs_ctl opc, void *arg);
+
+ /**
+ * Called when obtaining references to the resources of the resource
+ * hierarchy for a request that has arrived for handling at the PTLRPC
+ * service. Policies should return -ve for requests they do not wish
+ * to handle. This operation is mandatory.
+ *
+ * \param[in,out] policy The policy we're getting resources for.
+ * \param[in,out] nrq The request we are getting resources for.
+ * \param[in] parent The parent resource of the resource being
+ * requested; set to NULL if none.
+ * \param[out] resp The resource is to be returned here; the
+ * fallback policy in an NRS head should
+ * \e always return a non-NULL pointer value.
+ * \param[in] moving_req When set, signifies that this is an attempt
+ * to obtain resources for a request being moved
+ * to the high-priority NRS head by
+ * ldlm_lock_reorder_req().
+ * This implies two things:
+ * 1. We are under obd_export::exp_rpc_lock and
+ * so should not sleep.
+ * 2. We should not perform non-idempotent operations,
+ * and we can skip idempotent operations that were
+ * already carried out when resources were first
+ * taken for the request when it was initialized
+ * in ptlrpc_nrs_req_initialize().
+ *
+ * \retval 0, +ve The level of the returned resource in the resource
+ * hierarchy; currently only 0 (for a non-leaf resource)
+ * and 1 (for a leaf resource) are supported by the
+ * framework.
+ * \retval -ve error
+ *
+ * \see ptlrpc_nrs_req_initialize()
+ * \see ptlrpc_nrs_hpreq_add_nolock()
+ * \see ptlrpc_nrs_req_hp_move()
+ */
+ int (*op_res_get) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq,
+ const struct ptlrpc_nrs_resource *parent,
+ struct ptlrpc_nrs_resource **resp,
+ bool moving_req);
+ /**
+ * Called when releasing references taken for resources in the resource
+ * hierarchy for the request; this operation is optional.
+ *
+ * \param[in,out] policy The policy the resource belongs to
+ * \param[in] res The resource to be freed
+ *
+ * \see ptlrpc_nrs_req_finalize()
+ * \see ptlrpc_nrs_hpreq_add_nolock()
+ * \see ptlrpc_nrs_req_hp_move()
+ */
+ void (*op_res_put) (struct ptlrpc_nrs_policy *policy,
+ const struct ptlrpc_nrs_resource *res);
+
+ /**
+ * Obtains a request for handling from the policy, and optionally
+ * removes the request from the policy; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy to poll
+ * \param[in] peek When set, signifies that we just want to
+ * examine the request, and not handle it, so the
+ * request is not removed from the policy.
+ * \param[in] force When set, it will force a policy to return a
+ * request if it has one queued.
+ *
+ * \retval NULL No request available for handling
+ * \retval valid-pointer The request polled for handling
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ */
+ struct ptlrpc_nrs_request *
+ (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+ bool force);
+ /**
+ * Called when attempting to add a request to a policy for later
+ * handling; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy on which to enqueue \a nrq
+ * \param[in,out] nrq The request to enqueue
+ *
+ * \retval 0 success
+ * \retval != 0 error
+ *
+ * \see ptlrpc_nrs_req_add_nolock()
+ */
+ int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Removes a request from the policy's set of pending requests. Normally
+ * called after a request has been polled successfully from the policy
+ * for handling; this operation is mandatory.
+ *
+ * \param[in,out] policy The policy the request \a nrq belongs to
+ * \param[in,out] nrq The request to dequeue
+ *
+ * \see ptlrpc_nrs_req_del_nolock()
+ */
+ void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Called after the request has been carried out. Could be used for
+ * job/resource control; this operation is optional.
+ *
+ * \param[in,out] policy The policy that is finishing handling of request
+ * \a nrq
+ * \param[in,out] nrq The request
+ *
+ * \pre assert_spin_locked(&svcpt->scp_req_lock)
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+ void (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq);
+ /**
+ * Registers the policy's lprocfs interface with a PTLRPC service.
+ *
+ * \param[in] svc The service
+ *
+ * \retval 0 success
+ * \retval != 0 error
+ */
+ int (*op_lprocfs_init) (struct ptlrpc_service *svc);
+ /**
+ * Unregisters the policy's lprocfs interface from a PTLRPC service.
+ *
+ * In cases of failed policy registration in
+ * \e ptlrpc_nrs_policy_register(), this function may be called for a
+ * service which has not registered the policy successfully, so
+ * implementations of this method should make sure their operations are
+ * safe in such cases.
+ *
+ * \param[in] svc The service
+ */
+ void (*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
+
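+/*
+ * Sketch of a minimal policy ops table (illustrative only; the nrs_example_*
+ * handlers are hypothetical and only the mandatory operations are shown):
+ *
+ *	static const struct ptlrpc_nrs_pol_ops nrs_example_ops = {
+ *		.op_res_get	= nrs_example_res_get,
+ *		.op_req_get	= nrs_example_req_get,
+ *		.op_req_enqueue	= nrs_example_req_enqueue,
+ *		.op_req_dequeue	= nrs_example_req_dequeue,
+ *	};
+ */
+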
+/**
+ * Policy flags
+ */
+enum nrs_policy_flags {
+ /**
+ * Fallback policy, use this flag only on a single supported policy per
+ * service. The flag cannot be used on policies that use
+ * \e PTLRPC_NRS_FL_REG_EXTERN
+ */
+ PTLRPC_NRS_FL_FALLBACK = (1 << 0),
+ /**
+ * Start policy immediately after registering.
+ */
+ PTLRPC_NRS_FL_REG_START = (1 << 1),
+ /**
+ * This is a policy registering from a module different to the one NRS
+ * core ships in (currently ptlrpc).
+ */
+ PTLRPC_NRS_FL_REG_EXTERN = (1 << 2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ */
+enum ptlrpc_nrs_queue_type {
+ PTLRPC_NRS_QUEUE_REG = (1 << 0),
+ PTLRPC_NRS_QUEUE_HP = (1 << 1),
+ PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ * was initialized.
+ * - when the primary policy that was at the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ * RPC was initialized indicated, by returning a non-valid NRS resource
+ * reference, that it did not wish to, or for some other reason was not
+ * able to, handle the request.
+ * - when the primary policy that was at the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ * RPC was initialized fails later, during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+ spinlock_t nrs_lock;
+ /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+ /**
+ * List of registered policies
+ */
+ struct list_head nrs_policy_list;
+ /**
+ * List of policies with queued requests. Policies that have any
+ * outstanding requests are queued here, and this list is queried
+ * in a round-robin manner from NRS core when obtaining a request
+ * for handling. This ensures that requests from policies that at some
+ * point transition away from the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+ */
+ struct list_head nrs_policy_queued;
+ /**
+ * Service partition for this NRS head
+ */
+ struct ptlrpc_service_part *nrs_svcpt;
+ /**
+ * Primary policy, which is the preferred policy for handling RPCs
+ */
+ struct ptlrpc_nrs_policy *nrs_policy_primary;
+ /**
+ * Fallback policy, which is the backup policy for handling RPCs
+ */
+ struct ptlrpc_nrs_policy *nrs_policy_fallback;
+ /**
+ * This NRS head handles either HP or regular requests
+ */
+ enum ptlrpc_nrs_queue_type nrs_queue_type;
+ /**
+ * # queued requests from all policies in this NRS head
+ */
+ unsigned long nrs_req_queued;
+ /**
+ * # scheduled requests from all policies in this NRS head
+ */
+ unsigned long nrs_req_started;
+ /**
+ * # policies on this NRS
+ */
+ unsigned nrs_num_pols;
+ /**
+ * This NRS head is in progress of starting a policy
+ */
+ unsigned nrs_policy_starting:1;
+ /**
+ * In progress of shutting down the whole NRS head; used during
+ * unregistration
+ */
+ unsigned nrs_stopping:1;
+};
+
+#define NRS_POL_NAME_MAX 16
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
+ *
+ * XXX: This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; so the result should
+ * not depend on transient service state or other properties that may
+ * influence the result.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+ /**
+ * Human-readable policy name
+ */
+ char nc_name[NRS_POL_NAME_MAX];
+ /**
+ * NRS operations for this policy
+ */
+ const struct ptlrpc_nrs_pol_ops *nc_ops;
+ /**
+ * Service compatibility predicate
+ */
+ nrs_pol_desc_compat_t nc_compat;
+ /**
+ * Set for policies that support a single ptlrpc service, i.e. ones that
+ * have \a pd_compat set to nrs_policy_compat_one(). The variable value
+ * depicts the name of the single service that such policies are
+ * compatible with.
+ */
+ const char *nc_compat_svc_name;
+ /**
+ * Owner module for this policy descriptor; policies registering from a
+ * different module to the one the NRS framework is held within
+ * (currently ptlrpc), should set this field to THIS_MODULE.
+ */
+ struct module *nc_owner;
+ /**
+ * Policy registration flags; a bitmask of \e nrs_policy_flags
+ */
+ unsigned nc_flags;
+};
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Is used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+ /**
+ * Human-readable policy name
+ */
+ char pd_name[NRS_POL_NAME_MAX];
+ /**
+ * Link into nrs_core::nrs_policies
+ */
+ struct list_head pd_list;
+ /**
+ * NRS operations for this policy
+ */
+ const struct ptlrpc_nrs_pol_ops *pd_ops;
+ /**
+ * Service compatibility predicate
+ */
+ nrs_pol_desc_compat_t pd_compat;
+ /**
+ * Set for policies that are compatible with only one PTLRPC service.
+ *
+ * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+ */
+ const char *pd_compat_svc_name;
+ /**
+ * Owner module for this policy descriptor.
+ *
+ * We need to hold a reference to the module whenever we might make use
+ * of any of the module's contents, i.e.
+ * - If one or more instances of the policy are at a state where they
+ * might be handling a request, i.e.
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+ * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
+ * is taken on the module when
+ * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+ * becomes 0, so that we hold only one reference to the module maximum
+ * at any time.
+ *
+ * We do not need to hold a reference to the module, even though we
+ * might use code and data from the module, in the following cases:
+ * - During external policy registration, because this should happen in
+ * the module's init() function, in which case the module is safe from
+ * removal because a reference is being held on the module by the
+ * kernel, and iirc kmod (and I guess module-init-tools also) will
+ * serialize any racing processes properly anyway.
+ * - During external policy unregistration, because this should happen
+ * in a module's exit() function, and any attempts to start a policy
+ * instance would need to take a reference on the module, and this is
+ * not possible once we have reached the point where the exit()
+ * handler is called.
+ * - During service registration and unregistration, as service setup
+ * and cleanup, and policy registration, unregistration and policy
+ * instance starting, are serialized by \e nrs_core::nrs_mutex, so
+ * as long as users adhere to the convention of registering policies
+ * in init() and unregistering them in module exit() functions, there
+ * should not be a race between these operations.
+ * - During any policy-specific lprocfs operations, because a reference
+ * is held by the kernel on a proc entry that has been entered by a
+ * syscall; so as long as proc entries are removed at unregistration
+ * time, unregistration and lprocfs operations will be properly
+ * serialized.
+ */
+ struct module *pd_owner;
+ /**
+ * Bitmask of \e nrs_policy_flags
+ */
+ unsigned pd_flags;
+ /**
+ * # of references on this descriptor
+ */
+ atomic_t pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+ /**
+ * Not a valid policy state.
+ */
+ NRS_POL_STATE_INVALID,
+ /**
+ * Policies are at this state either at the start of their life, or
+ * transition here when the user selects a different policy to act
+ * as the primary one.
+ */
+ NRS_POL_STATE_STOPPED,
+ /**
+ * Policy is in the process of stopping
+ */
+ NRS_POL_STATE_STOPPING,
+ /**
+ * Policy is in the process of starting
+ */
+ NRS_POL_STATE_STARTING,
+ /**
+ * A policy is in this state in two cases:
+ * - it is the fallback policy, which is always in this state.
+ * - it has been activated by the user; i.e. it is the primary policy.
+ */
+ NRS_POL_STATE_STARTED,
+};
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information for the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+ /**
+ * Policy name
+ */
+ char pi_name[NRS_POL_NAME_MAX];
+ /**
+ * Current policy state
+ */
+ enum ptlrpc_nrs_pol_state pi_state;
+ /**
+ * # RPCs enqueued for later dispatching by the policy
+ */
+ long pi_req_queued;
+ /**
+ * # RPCs started for dispatch by the policy
+ */
+ long pi_req_started;
+ /**
+ * Is this a fallback policy?
+ */
+ unsigned pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+ /**
+ * Linkage into the NRS head's list of policies,
+ * ptlrpc_nrs:nrs_policy_list
+ */
+ struct list_head pol_list;
+ /**
+ * Linkage into the NRS head's list of policies with enqueued
+ * requests ptlrpc_nrs:nrs_policy_queued
+ */
+ struct list_head pol_list_queued;
+ /**
+ * Current state of this policy
+ */
+ enum ptlrpc_nrs_pol_state pol_state;
+ /**
+ * Bitmask of nrs_policy_flags
+ */
+ unsigned pol_flags;
+ /**
+ * # RPCs enqueued for later dispatching by the policy
+ */
+ long pol_req_queued;
+ /**
+ * # RPCs started for dispatch by the policy
+ */
+ long pol_req_started;
+ /**
+ * Usage reference count taken on the policy instance
+ */
+ long pol_ref;
+ /**
+ * The NRS head this policy has been created at
+ */
+ struct ptlrpc_nrs *pol_nrs;
+ /**
+ * Private policy data; varies by policy type
+ */
+ void *pol_private;
+ /**
+ * Policy descriptor for this policy instance.
+ */
+ struct ptlrpc_nrs_pol_desc *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ * ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ * policies; e.g. on a policy that performs round robin or similar order
+ * scheduling across client NIDs, there would be one NRS resource per unique
+ * client NID. On a policy which performs round robin scheduling across
+ * backend filesystem objects, there would be one resource associated with
+ * each of the backend filesystem objects partaking in the scheduling
+ * performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ */
+struct ptlrpc_nrs_resource {
+ /**
+ * This NRS resource's parent; is NULL for resources embedded in NRS
+ * policy instances; i.e. those are top-level ones.
+ */
+ struct ptlrpc_nrs_resource *res_parent;
+ /**
+ * The policy associated with this resource.
+ */
+ struct ptlrpc_nrs_policy *res_policy;
+};
+
+enum {
+ NRS_RES_FALLBACK,
+ NRS_RES_PRIMARY,
+ NRS_RES_MAX
+};
+
+/* \name fifo
+ *
+ * FIFO policy
+ *
+ * This policy is a logical wrapper around previous, non-NRS functionality.
+ * It dispatches RPCs in the same order as they arrive from the network. This
+ * policy is currently used as the fallback policy, and the only enabled policy
+ * on all NRS heads of all PTLRPC service partitions.
+ * @{
+ */
+
+/**
+ * Private data structure for the FIFO policy
+ */
+struct nrs_fifo_head {
+ /**
+ * Resource object for policy instance.
+ */
+ struct ptlrpc_nrs_resource fh_res;
+ /**
+ * List of queued requests.
+ */
+ struct list_head fh_list;
+ /**
+ * For debugging purposes.
+ */
+ __u64 fh_sequence;
+};
+
+struct nrs_fifo_req {
+ struct list_head fr_list;
+ __u64 fr_sequence;
+};
+
+/** @} fifo */
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies
+ * use for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+ /**
+ * The request's resource hierarchy.
+ */
+ struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX];
+ /**
+ * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+ * policy that was used to enqueue the request.
+ *
+ * \see nrs_request_enqueue()
+ */
+ unsigned nr_res_idx;
+ unsigned nr_initialized:1;
+ unsigned nr_enqueued:1;
+ unsigned nr_started:1;
+ unsigned nr_finalized:1;
+
+ /**
+ * Policy-specific fields, used for determining a request's scheduling
+ * priority, and other supporting functionality.
+ */
+ union {
+ /**
+ * Fields for the FIFO policy
+ */
+ struct nrs_fifo_req fifo;
+ } nr_u;
+ /**
+ * Externally-registering policies may want to use this to allocate
+ * their own request properties.
+ */
+ void *ext;
+};
+
+/** @} nrs */
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to fastest release of that lock.
+ * Currently implemented only for OSTs, in a way that gives all IO and
+ * truncate RPCs coming from a locked region where a lock is contended
+ * priority over other requests.
+ */
+struct ptlrpc_hpreq_ops {
+ /**
+ * Check if the lock handle of the given lock is the same as
+ * taken from the request.
+ */
+ int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+ /**
+ * Check if the request is a high priority one.
+ */
+ int (*hpreq_check)(struct ptlrpc_request *);
+ /**
+ * Called after the request has been handled.
+ */
+ void (*hpreq_fini)(struct ptlrpc_request *);
+};
+
+/**
+ * Represents remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+ /* Request type: one of PTL_RPC_MSG_* */
+ int rq_type;
+ /** Result of request processing */
+ int rq_status;
+ /**
+ * Linkage item through which this request is included into
+ * sending/delayed lists on client and into rqbd list on server
+ */
+ struct list_head rq_list;
+ /**
+ * Server-side list of incoming unserved requests sorted by arrival
+ * time. Traversed from time to time to notice requests that are
+ * about to expire and to send back "early replies" to clients to let
+ * them know the server is alive and well, just too busy to service
+ * their requests in time.
+ */
+ struct list_head rq_timed_list;
+ /** server-side history, used for debugging purposes. */
+ struct list_head rq_history_list;
+ /** server-side per-export list */
+ struct list_head rq_exp_list;
+ /** server-side hp handlers */
+ struct ptlrpc_hpreq_ops *rq_ops;
+
+ /** initial thread servicing this request */
+ struct ptlrpc_thread *rq_svc_thread;
+
+ /** history sequence # */
+ __u64 rq_history_seq;
+ /** \addtogroup nrs
+ * @{
+ */
+ /** stub for NRS request */
+ struct ptlrpc_nrs_request rq_nrq;
+ /** @} nrs */
+ /** the index of service's srv_at_array into which request is linked */
+ time_t rq_at_index;
+ /** Lock to protect request flags and some other important bits, like
+ * rq_list
+ */
+ spinlock_t rq_lock;
+ /** client-side flags are serialized by rq_lock */
+ unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+ rq_timedout:1, rq_resend:1, rq_restart:1,
+ /**
+ * when ->rq_replay is set, request is kept by the client even
+ * after server commits corresponding transaction. This is
+ * used for operations that require sequence of multiple
+ * requests to be replayed. The only example currently is file
+ * open/close. When last request in such a sequence is
+ * committed, ->rq_replay is cleared on all requests in the
+ * sequence.
+ */
+ rq_replay:1,
+ rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+ rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+ rq_early:1,
+ rq_req_unlink:1, rq_reply_unlink:1,
+ rq_memalloc:1, /* req originated from "kswapd" */
+ /* server-side flags */
+ rq_packed_final:1, /* packed final reply */
+ rq_hp:1, /* high priority RPC */
+ rq_at_linked:1, /* link into service's srv_at_array */
+ rq_reply_truncate:1,
+ rq_committed:1,
+ /* whether the "rq_set" is a valid one */
+ rq_invalid_rqset:1,
+ rq_generation_set:1,
+ /* do not resend request on -EINPROGRESS */
+ rq_no_retry_einprogress:1,
+ /* allow the req to be sent if the import is in recovery
+ * status */
+ rq_allow_replay:1;
+
+ unsigned int rq_nr_resend;
+
+ enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+ enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
+ atomic_t rq_refcount;/* client-side refcount for SENT race,
+ server-side refcount for multiple replies */
+
+ /** Portal to which this request would be sent */
+ short rq_request_portal; /* XXX FIXME bug 249 */
+ /** Portal where to wait for reply and where reply would be sent */
+ short rq_reply_portal; /* XXX FIXME bug 249 */
+
+ /**
+ * client-side:
+ * !rq_truncate : # reply bytes actually received,
+ * rq_truncate : required repbuf_len for resend
+ */
+ int rq_nob_received;
+ /** Request length */
+ int rq_reqlen;
+ /** Reply length */
+ int rq_replen;
+ /** Request message - what client sent */
+ struct lustre_msg *rq_reqmsg;
+ /** Reply message - server response */
+ struct lustre_msg *rq_repmsg;
+ /** Transaction number */
+ __u64 rq_transno;
+ /** xid */
+ __u64 rq_xid;
+ /**
+ * List item for the replay list. Not yet committed requests get linked
+ * there.
+ * Also see \a rq_replay comment above.
+ */
+ struct list_head rq_replay_list;
+
+ /**
+ * security and encryption data
+ * @{ */
+ struct ptlrpc_cli_ctx *rq_cli_ctx; /**< client's half ctx */
+ struct ptlrpc_svc_ctx *rq_svc_ctx; /**< server's half ctx */
+ struct list_head rq_ctx_chain; /**< link to waited ctx */
+
+ struct sptlrpc_flavor rq_flvr; /**< for client & server */
+ enum lustre_sec_part rq_sp_from;
+
+ /* client/server security flags */
+ unsigned int
+ rq_ctx_init:1, /* context initiation */
+ rq_ctx_fini:1, /* context destroy */
+ rq_bulk_read:1, /* request bulk read */
+ rq_bulk_write:1, /* request bulk write */
+ /* server authentication flags */
+ rq_auth_gss:1, /* authenticated by gss */
+ rq_auth_remote:1, /* authed as remote user */
+ rq_auth_usr_root:1, /* authed as root */
+ rq_auth_usr_mdt:1, /* authed as mdt */
+ rq_auth_usr_ost:1, /* authed as ost */
+ /* security tfm flags */
+ rq_pack_udesc:1,
+ rq_pack_bulk:1,
+ /* doesn't expect reply FIXME */
+ rq_no_reply:1,
+ rq_pill_init:1; /* pill initialized */
+
+ uid_t rq_auth_uid; /* authed uid */
+ uid_t rq_auth_mapped_uid; /* authed uid mapped to */
+
+ /* (server side), pointed directly into req buffer */
+ struct ptlrpc_user_desc *rq_user_desc;
+
+ /* various buffer pointers */
+ struct lustre_msg *rq_reqbuf; /* req wrapper */
+ char *rq_repbuf; /* rep buffer */
+ struct lustre_msg *rq_repdata; /* rep wrapper msg */
+ struct lustre_msg *rq_clrbuf; /* only in priv mode */
+ int rq_reqbuf_len; /* req wrapper buf len */
+ int rq_reqdata_len; /* req wrapper msg len */
+ int rq_repbuf_len; /* rep buffer len */
+ int rq_repdata_len; /* rep wrapper msg len */
+ int rq_clrbuf_len; /* only in priv mode */
+ int rq_clrdata_len; /* only in priv mode */
+
+ /** early replies go to offset 0, regular replies go after that */
+ unsigned int rq_reply_off;
+
+ /** @} */
+
+ /** Fields that help to see if request and reply were swabbed or not */
+ __u32 rq_req_swab_mask;
+ __u32 rq_rep_swab_mask;
+
+ /** What was import generation when this request was sent */
+ int rq_import_generation;
+ enum lustre_imp_state rq_send_state;
+
+ /** how many early replies (for stats) */
+ int rq_early_count;
+
+ /** client+server request */
+ lnet_handle_md_t rq_req_md_h;
+ struct ptlrpc_cb_id rq_req_cbid;
+ /** optional time limit for send attempts */
+ long rq_delay_limit;
+ /** time request was first queued */
+ unsigned long rq_queued_time;
+
+ /* server-side... */
+ /** request arrival time */
+ struct timeval rq_arrival_time;
+ /** separated reply state */
+ struct ptlrpc_reply_state *rq_reply_state;
+ /** incoming request buffer */
+ struct ptlrpc_request_buffer_desc *rq_rqbd;
+
+ /** client-only incoming reply */
+ lnet_handle_md_t rq_reply_md_h;
+ wait_queue_head_t rq_reply_waitq;
+ struct ptlrpc_cb_id rq_reply_cbid;
+
+ /** our LNet NID */
+ lnet_nid_t rq_self;
+ /** Peer description (the other side) */
+ lnet_process_id_t rq_peer;
+ /** Server-side, export on which request was received */
+ struct obd_export *rq_export;
+ /** Client side, import where request is being sent */
+ struct obd_import *rq_import;
+
+ /** Replay callback, called after request is replayed at recovery */
+ void (*rq_replay_cb)(struct ptlrpc_request *);
+ /**
+ * Commit callback, called when request is committed and about to be
+ * freed.
+ */
+ void (*rq_commit_cb)(struct ptlrpc_request *);
+ /** Opaque data for replay and commit callbacks. */
+ void *rq_cb_data;
+
+ /** For bulk requests on client only: bulk descriptor */
+ struct ptlrpc_bulk_desc *rq_bulk;
+
+ /** client outgoing req */
+ /**
+ * when request/reply sent (secs), or time when request should be sent
+ */
+ time_t rq_sent;
+ /** time for request really sent out */
+ time_t rq_real_sent;
+
+ /** when request must finish. volatile
+ * so that servers' early reply updates to the deadline aren't
+ * kept in per-cpu cache */
+ volatile time_t rq_deadline;
+ /** when req reply unlink must finish. */
+ time_t rq_reply_deadline;
+ /** when req bulk unlink must finish. */
+ time_t rq_bulk_deadline;
+ /**
+ * service time estimate (secs)
+ * If the request is not served by this time, it is marked as timed out.
+ */
+ int rq_timeout;
+
+ /** Multi-rpc bits */
+ /** Per-request waitq introduced by bug 21938 for recovery waiting */
+ wait_queue_head_t rq_set_waitq;
+ /** Link item for request set lists */
+ struct list_head rq_set_chain;
+ /** Link back to the request set */
+ struct ptlrpc_request_set *rq_set;
+ /** Async completion handler, called when reply is received */
+ ptlrpc_interpterer_t rq_interpret_reply;
+ /** Async completion context */
+ union ptlrpc_async_args rq_async_args;
+
+ /** Pool if request is from preallocated list */
+ struct ptlrpc_request_pool *rq_pool;
+
+ struct lu_context rq_session;
+ struct lu_context rq_recov_session;
+
+ /** request format description */
+ struct req_capsule rq_pill;
+};
+
+/**
+ * Call completion handler for rpc if any, return its status or original
+ * rc if there was no handler defined for this request.
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req, int rc)
+{
+ if (req->rq_interpret_reply != NULL) {
+ req->rq_status = req->rq_interpret_reply(env, req,
+ &req->rq_async_args,
+ rc);
+ return req->rq_status;
+ }
+ return rc;
+}
+
+/** \addtogroup nrs
+ * @{
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf);
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf);
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_pol_info *info);
+
+/*
+ * Can the request be moved from the regular NRS head to the high-priority NRS
+ * head (of the same PTLRPC service partition), if any?
+ *
+ * For a reliable result, this should be checked under svcpt->scp_req_lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+ struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+ /**
+ * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the
+ * request has been enqueued first, and ptlrpc_nrs_request::nr_started
+ * to make sure it has not been scheduled yet (analogous to the previous
+ * (non-NRS) check of !list_empty(&ptlrpc_request::rq_list)).
+ */
+ return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp;
+}
+/** @} nrs */
+
+/**
+ * Returns 1 if request buffer at offset \a index was already swabbed
+ */
+static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+ return req->rq_req_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request reply buffer at offset \a index was already swabbed
+ */
+static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+ return req->rq_rep_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req)
+{
+ return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Returns 1 if request reply needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req)
+{
+ return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Mark request buffer at offset \a index that it was already swabbed
+ */
+static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+ LASSERT((req->rq_req_swab_mask & (1 << index)) == 0);
+ req->rq_req_swab_mask |= 1 << index;
+}
+
+/**
+ * Mark request reply buffer at offset \a index that it was already swabbed
+ */
+static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+ LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+ LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0);
+ req->rq_rep_swab_mask |= 1 << index;
+}
+
+/**
+ * Convert numerical request phase value \a phase into text string description
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+ switch (phase) {
+ case RQ_PHASE_NEW:
+ return "New";
+ case RQ_PHASE_RPC:
+ return "Rpc";
+ case RQ_PHASE_BULK:
+ return "Bulk";
+ case RQ_PHASE_INTERPRET:
+ return "Interpret";
+ case RQ_PHASE_COMPLETE:
+ return "Complete";
+ case RQ_PHASE_UNREGISTERING:
+ return "Unregistering";
+ default:
+ return "?Phase?";
+ }
+}
+
+/**
+ * Convert numerical request phase of the request \a req into text string
+ * description
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+ return ptlrpc_phase2str(req->rq_phase);
+}
+
+/**
+ * Debugging functions and helpers to print request structure into debug log
+ * @{
+ */
+/* Spare the preprocessor, spoil the bugs. */
+#define FLAG(field, str) (field ? str : "")
+
+/** Convert bit flags into a string */
+#define DEBUG_REQ_FLAGS(req) \
+ ptlrpc_rqphase2str(req), \
+ FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
+ FLAG(req->rq_err, "E"), \
+ FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
+ FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
+ FLAG(req->rq_no_resend, "N"), \
+ FLAG(req->rq_waiting, "W"), \
+ FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \
+ FLAG(req->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+ struct libcfs_debug_msg_data *data, const char *fmt, ...)
+ __printf(3, 4);
+
+/**
+ * Helper that decides if we need to print request according to current debug
+ * level settings
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...) \
+do { \
+ CFS_CHECK_STACK(msgdata, mask, cdls); \
+ \
+ if (((mask) & D_CANTMASK) != 0 || \
+ ((libcfs_debug & (mask)) != 0 && \
+ (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \
+ _debug_req((req), msgdata, fmt, ##a); \
+} while (0)
+
+/**
+ * This is the debug print function you need to use to print request structure
+ * content into lustre debug log.
+ * For most callers (level is a constant) this is resolved at compile time. */
+#define DEBUG_REQ(level, req, fmt, args...) \
+do { \
+ if ((level) & (D_ERROR | D_WARNING)) { \
+ static struct cfs_debug_limit_state cdls; \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \
+ debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+ } else { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \
+ debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+ } \
+} while (0)
+/** @} */
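+
+/*
+ * Usage sketch: a typical DEBUG_REQ() call from request-processing code.
+ * The surrounding handler and the choice of debug masks are illustrative
+ * assumptions; only the macro itself is defined above.
+ *
+ * \code
+ *	static int example_handle_reply(struct ptlrpc_request *req, int rc)
+ *	{
+ *		DEBUG_REQ(D_RPCTRACE, req, "reply handled: rc = %d", rc);
+ *		if (rc != 0)
+ *			DEBUG_REQ(D_ERROR, req, "handler failed: rc = %d", rc);
+ *		return rc;
+ *	}
+ * \endcode
+ */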
+
+/**
+ * Structure that defines a single page of a bulk transfer
+ */
+struct ptlrpc_bulk_page {
+ /** Linkage to list of pages in a bulk */
+ struct list_head bp_link;
+ /**
+ * Number of bytes in a page to transfer starting from \a bp_pageoffset
+ */
+ int bp_buflen;
+ /** offset within a page */
+ int bp_pageoffset;
+ /** The page itself */
+ struct page *bp_page;
+};
+
+#define BULK_GET_SOURCE 0
+#define BULK_PUT_SINK 1
+#define BULK_GET_SINK 2
+#define BULK_PUT_SOURCE 3
+
+/**
+ * Definition of bulk descriptor.
+ * Bulks are special "Two phase" RPCs where the initial request message
+ * is sent first and is followed by a transfer (or receiving) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulk transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ * Another user is readpage for MDT.
+ */
+struct ptlrpc_bulk_desc {
+ /** completed with failure */
+ unsigned long bd_failure:1;
+ /** {put,get}{source,sink} */
+ unsigned long bd_type:2;
+ /** client side */
+ unsigned long bd_registered:1;
+ /** For serialization with callback */
+ spinlock_t bd_lock;
+ /** Import generation when request for this bulk was sent */
+ int bd_import_generation;
+ /** LNet portal for this bulk */
+ __u32 bd_portal;
+ /** Server side - export this bulk created for */
+ struct obd_export *bd_export;
+ /** Client side - import this bulk was sent on */
+ struct obd_import *bd_import;
+ /** Back pointer to the request */
+ struct ptlrpc_request *bd_req;
+ wait_queue_head_t bd_waitq; /* server side only WQ */
+ int bd_iov_count; /* # entries in bd_iov */
+ int bd_max_iov; /* allocated size of bd_iov */
+ int bd_nob; /* # bytes covered */
+ int bd_nob_transferred; /* # bytes GOT/PUT */
+
+ __u64 bd_last_xid;
+
+ struct ptlrpc_cb_id bd_cbid; /* network callback info */
+ lnet_nid_t bd_sender; /* stash event::sender */
+ int bd_md_count; /* # valid entries in bd_mds */
+ int bd_md_max_brw; /* max entries in bd_mds */
+ /** array of associated MDs */
+ lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT];
+
+ /*
+ * encrypt iov, size is either 0 or bd_iov_count.
+ */
+ lnet_kiov_t *bd_enc_iov;
+
+ lnet_kiov_t bd_iov[0];
+};
+
+enum {
+ SVC_STOPPED = 1 << 0,
+ SVC_STOPPING = 1 << 1,
+ SVC_STARTING = 1 << 2,
+ SVC_RUNNING = 1 << 3,
+ SVC_EVENT = 1 << 4,
+ SVC_SIGNAL = 1 << 5,
+};
+
+#define PTLRPC_THR_NAME_LEN 32
+/**
+ * Definition of server service thread structure
+ */
+struct ptlrpc_thread {
+ /**
+ * List of active threads in svc->srv_threads
+ */
+ struct list_head t_link;
+ /**
+ * thread-private data (preallocated memory)
+ */
+ void *t_data;
+ __u32 t_flags;
+ /**
+ * service thread index, from ptlrpc_start_threads
+ */
+ unsigned int t_id;
+ /**
+ * service thread pid
+ */
+ pid_t t_pid;
+ /**
+ * put watchdog in the structure per thread b=14840
+ *
+ * The Lustre watchdog is removed for the client in the hope
+ * that a generic watchdog can be merged into the kernel.
+ * When that happens, we should add the code below back.
+ *
+ * struct lc_watchdog *t_watchdog;
+ */
+ /**
+ * the svc this thread belonged to b=18582
+ */
+ struct ptlrpc_service_part *t_svcpt;
+ wait_queue_head_t t_ctl_waitq;
+ struct lu_env *t_env;
+ char t_name[PTLRPC_THR_NAME_LEN];
+};
+
+static inline int thread_is_init(struct ptlrpc_thread *thread)
+{
+ return thread->t_flags == 0;
+}
+
+static inline int thread_is_stopped(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_STOPPED);
+}
+
+static inline int thread_is_stopping(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_STOPPING);
+}
+
+static inline int thread_is_starting(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_STARTING);
+}
+
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_RUNNING);
+}
+
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_EVENT);
+}
+
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+ return !!(thread->t_flags & SVC_SIGNAL);
+}
+
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+ thread->t_flags &= ~flags;
+}
+
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+ thread->t_flags = flags;
+}
+
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+ thread->t_flags |= flags;
+}
+
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+ __u32 flags)
+{
+ if (thread->t_flags & flags) {
+ thread->t_flags &= ~flags;
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * Request buffer descriptor structure.
+ * This is a structure that contains one posted request buffer for a service.
+ * Once data lands in a buffer, the event callback creates an actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+ /** Link item for rqbds on a service */
+ struct list_head rqbd_list;
+ /** History of requests for this buffer */
+ struct list_head rqbd_reqs;
+ /** Back pointer to service for which this buffer is registered */
+ struct ptlrpc_service_part *rqbd_svcpt;
+ /** LNet descriptor */
+ lnet_handle_md_t rqbd_md_h;
+ int rqbd_refcount;
+ /** The buffer itself */
+ char *rqbd_buffer;
+ struct ptlrpc_cb_id rqbd_cbid;
+ /**
+ * This "embedded" request structure is only used for the
+ * last request to fit into the buffer
+ */
+ struct ptlrpc_request rqbd_req;
+};
+
+typedef int (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+ /**
+ * if non-NULL called during thread creation (ptlrpc_start_thread())
+ * to initialize service specific per-thread state.
+ */
+ int (*so_thr_init)(struct ptlrpc_thread *thr);
+ /**
+ * if non-NULL called during thread shutdown (ptlrpc_main()) to
+ * destruct state created by ->srv_init().
+ */
+ void (*so_thr_done)(struct ptlrpc_thread *thr);
+ /**
+ * Handler function for incoming requests for this service
+ */
+ int (*so_req_handler)(struct ptlrpc_request *req);
+ /**
+ * function to determine priority of the request, it's called
+ * on every new request
+ */
+ int (*so_hpreq_handler)(struct ptlrpc_request *);
+ /**
+ * service-specific print fn
+ */
+ void (*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: put it here for reducing patch dependence */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service is listening on a particular portal (like tcp port)
+ * and perform actions for a specific server like IO service for OST
+ * or general metadata service for MDS.
+ */
+struct ptlrpc_service {
+ /** serialize /proc operations */
+ spinlock_t srv_lock;
+ /** most often accessed fields */
+ /** chain thru all services */
+ struct list_head srv_list;
+ /** service operations table */
+ struct ptlrpc_service_ops srv_ops;
+ /** only statically allocated strings here; we don't clean them */
+ char *srv_name;
+ /** only statically allocated strings here; we don't clean them */
+ char *srv_thread_name;
+ /** service thread list */
+ struct list_head srv_threads;
+ /** # of threads to create for each partition on initialization */
+ int srv_nthrs_cpt_init;
+ /** limit on the number of threads for each partition */
+ int srv_nthrs_cpt_limit;
+ /** Root of /proc dir tree for this service */
+ struct proc_dir_entry *srv_procroot;
+ /** Pointer to statistic data for this service */
+ struct lprocfs_stats *srv_stats;
+ /** # hp per lp reqs to handle */
+ int srv_hpreq_ratio;
+ /** biggest request to receive */
+ int srv_max_req_size;
+ /** biggest reply to send */
+ int srv_max_reply_size;
+ /** size of individual buffers */
+ int srv_buf_size;
+ /** # buffers to allocate in 1 group */
+ int srv_nbuf_per_group;
+ /** Local portal on which to receive requests */
+ __u32 srv_req_portal;
+ /** Portal on the client to send replies to */
+ __u32 srv_rep_portal;
+ /**
+ * Tags for lu_context associated with this thread, see struct
+ * lu_context.
+ */
+ __u32 srv_ctx_tags;
+ /** soft watchdog timeout multiplier */
+ int srv_watchdog_factor;
+ /** under unregister_service */
+ unsigned srv_is_stopping:1;
+
+ /** max # request buffers in history per partition */
+ int srv_hist_nrqbds_cpt_max;
+ /** number of CPTs this service bound on */
+ int srv_ncpts;
+ /** CPTs array this service bound on */
+ __u32 *srv_cpts;
+ /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+ int srv_cpt_bits;
+ /** CPT table this service is running over */
+ struct cfs_cpt_table *srv_cptable;
+ /**
+ * partition data for ptlrpc service
+ */
+ struct ptlrpc_service_part *srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service only has one instance of it right now, we
+ * will have multiple instances very soon (one instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ * serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ * serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ * serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ * serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+ /** back reference to owner */
+ struct ptlrpc_service *scp_service __cfs_cacheline_aligned;
+ /* CPT id, reserved */
+ int scp_cpt;
+ /** always increasing number */
+ int scp_thr_nextid;
+ /** # of starting threads */
+ int scp_nthrs_starting;
+ /** # of stopping threads, reserved for shrinking threads */
+ int scp_nthrs_stopping;
+ /** # running threads */
+ int scp_nthrs_running;
+ /** service threads list */
+ struct list_head scp_threads;
+
+ /**
+ * serialize the following fields, used for protecting the
+ * rqbd list and incoming requests waiting for preprocessing;
+ * threads starting & stopping are also protected by this lock.
+ */
+ spinlock_t scp_lock __cfs_cacheline_aligned;
+ /** total # req buffer descs allocated */
+ int scp_nrqbds_total;
+ /** # posted request buffers for receiving */
+ int scp_nrqbds_posted;
+ /** rqbd allocation in progress */
+ int scp_rqbd_allocating;
+ /** # incoming reqs */
+ int scp_nreqs_incoming;
+ /** request buffers to be reposted */
+ struct list_head scp_rqbd_idle;
+ /** req buffers receiving */
+ struct list_head scp_rqbd_posted;
+ /** incoming reqs */
+ struct list_head scp_req_incoming;
+ /** timeout before re-posting reqs, in ticks */
+ long scp_rqbd_timeout;
+ /**
+ * all threads sleep on this. This wait-queue is signalled when a new
+ * incoming request arrives and when a difficult reply has to be handled.
+ */
+ wait_queue_head_t scp_waitq;
+
+ /** request history */
+ struct list_head scp_hist_reqs;
+ /** request buffer history */
+ struct list_head scp_hist_rqbds;
+ /** # request buffers in history */
+ int scp_hist_nrqbds;
+ /** sequence number for request */
+ __u64 scp_hist_seq;
+ /** highest seq culled from history */
+ __u64 scp_hist_seq_culled;
+
+ /**
+ * serialize the following fields, used for processing requests
+ * sent to this portal
+ */
+ spinlock_t scp_req_lock __cfs_cacheline_aligned;
+ /** # reqs in either of the NRS heads below */
+ /** # reqs being served */
+ int scp_nreqs_active;
+ /** # HPreqs being served */
+ int scp_nhreqs_active;
+ /** # hp requests handled */
+ int scp_hreq_count;
+
+ /** NRS head for regular requests */
+ struct ptlrpc_nrs scp_nrs_reg;
+ /** NRS head for HP requests; this is only valid for services that can
+ * handle HP requests */
+ struct ptlrpc_nrs *scp_nrs_hp;
+
+ /** AT stuff */
+ /** @{ */
+ /**
+ * serialize the following fields, used for changes on
+ * adaptive timeout
+ */
+ spinlock_t scp_at_lock __cfs_cacheline_aligned;
+ /** estimated rpc service time */
+ struct adaptive_timeout scp_at_estimate;
+ /** reqs waiting for replies */
+ struct ptlrpc_at_array scp_at_array;
+ /** early reply timer */
+ struct timer_list scp_at_timer;
+ /** debug */
+ unsigned long scp_at_checktime;
+ /** check early replies */
+ unsigned scp_at_check;
+ /** @} */
+
+ /**
+ * serialize the following fields, used for processing
+ * replies for this portal
+ */
+ spinlock_t scp_rep_lock __cfs_cacheline_aligned;
+ /** all the active replies */
+ struct list_head scp_rep_active;
+ /** List of free reply_states */
+ struct list_head scp_rep_idle;
+ /** waitq to run, when adding stuff to srv_free_rs_list */
+ wait_queue_head_t scp_rep_waitq;
+ /** # 'difficult' replies */
+ atomic_t scp_nreps_difficult;
+};
+
+#define ptlrpc_service_for_each_part(part, i, svc) \
+ for (i = 0; \
+ i < (svc)->srv_ncpts && \
+ (svc)->srv_parts != NULL && \
+ ((part) = (svc)->srv_parts[i]) != NULL; i++)
+
+/**
+ * Declaration of ptlrpcd control structure
+ */
+struct ptlrpcd_ctl {
+ /**
+ * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+ */
+ unsigned long pc_flags;
+ /**
+ * Thread lock protecting structure fields.
+ */
+ spinlock_t pc_lock;
+ /**
+ * Start completion.
+ */
+ struct completion pc_starting;
+ /**
+ * Stop completion.
+ */
+ struct completion pc_finishing;
+ /**
+ * Thread requests set.
+ */
+ struct ptlrpc_request_set *pc_set;
+ /**
+ * Thread name used in cfs_daemonize()
+ */
+ char pc_name[16];
+ /**
+ * Environment for request interpreters to run in.
+ */
+ struct lu_env pc_env;
+ /**
+ * Index of ptlrpcd thread in the array.
+ */
+ int pc_index;
+ /**
+ * Number of the ptlrpcd's partners.
+ */
+ int pc_npartners;
+ /**
+ * Pointer to the array of partners' ptlrpcd_ctl structure.
+ */
+ struct ptlrpcd_ctl **pc_partners;
+ /**
+ * Record the partner index to be processed next.
+ */
+ int pc_cursor;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+ /**
+ * Ptlrpc thread start flag.
+ */
+ LIOD_START = 1 << 0,
+ /**
+ * Ptlrpc thread stop flag.
+ */
+ LIOD_STOP = 1 << 1,
+ /**
+ * Ptlrpc thread force flag (only stop force so far).
+ * This will cause any inflight rpcs handled by the thread
+ * to be aborted if LIOD_STOP is specified.
+ */
+ LIOD_FORCE = 1 << 2,
+ /**
+ * This is a recovery ptlrpc thread.
+ */
+ LIOD_RECOVERY = 1 << 3,
+ /**
+ * The ptlrpcd is bound to some CPU core.
+ */
+ LIOD_BIND = 1 << 4,
+};
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc)
+{
+ return true;
+}
+
+/**
+ * Service compatibility function; the policy is compatible with only a specific
+ * service which is identified by its human-readable name at
+ * ptlrpc_service::srv_name.
+ *
+ * \param[in] svc The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The policy is not compatible with the service
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc)
+{
+ LASSERT(desc->pd_compat_svc_name != NULL);
+ return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0;
+}
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern lnet_handle_eq_t ptlrpc_eq_h;
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+ lnet_process_id_t *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happens to the
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(lnet_event_t *ev);
+extern void reply_in_callback(lnet_event_t *ev);
+extern void client_bulk_callback(lnet_event_t *ev);
+extern void request_in_callback(lnet_event_t *ev);
+extern void reply_out_callback(lnet_event_t *ev);
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+ lnet_nid_t self,
+ struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
+extern lnet_pid_t ptl_get_pid(void);
+
+/* ptlrpc/niobuf.c */
+/**
+ * Actual interfacing with LNet to put/get/register/unregister stuff
+ * @{
+ */
+
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
+
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+ struct ptlrpc_bulk_desc *desc;
+ int rc;
+
+ LASSERT(req != NULL);
+ desc = req->rq_bulk;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+ req->rq_bulk_deadline > get_seconds())
+ return 1;
+
+ if (!desc)
+ return 0;
+
+ spin_lock(&desc->bd_lock);
+ rc = desc->bd_md_count;
+ spin_unlock(&desc->bd_lock);
+ return rc;
+}
+
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY 0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
+int ptlrpc_error(struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
+/** @} */
+
+/* ptlrpc/client.c */
+/**
+ * Client-side portals API. Everything to send requests, receive replies,
+ * request queues, request management, etc.
+ * @{
+ */
+void ptlrpc_request_committed(struct ptlrpc_request *req, int force);
+
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+ struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct obd_import *imp);
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+
+int ptlrpc_queue_wait(struct ptlrpc_request *req);
+int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
+void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_cleanup_imp(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+ void *arg);
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+ set_interpreter_func fn, void *data);
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+int ptlrpc_expired_set(void *data);
+void ptlrpc_interrupted_set(void *data);
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+ struct ptlrpc_request *req);
+
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+ void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+ const struct req_format *format);
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+ struct ptlrpc_request_pool *,
+ const struct req_format *format);
+void ptlrpc_request_free(struct ptlrpc_request *request);
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode);
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+ const struct req_format *format,
+ __u32 version, int opcode);
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode, char **bufs,
+ struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+ int opcode, int count, __u32 *lengths,
+ char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
+ __u32 version, int opcode,
+ int count, __u32 *lengths, char **bufs,
+ struct ptlrpc_request_pool *pool);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
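+
+/*
+ * Usage sketch: a minimal synchronous client RPC built from the helpers
+ * above, loosely following the ping pattern. The request format RQF_OBD_PING,
+ * LUSTRE_OBD_VERSION and OBD_PING are assumed to be visible to the caller;
+ * error handling is reduced to the essentials.
+ *
+ * \code
+ *	static int example_sync_rpc(struct obd_import *imp)
+ *	{
+ *		struct ptlrpc_request *req;
+ *		int rc;
+ *
+ *		req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+ *						LUSTRE_OBD_VERSION, OBD_PING);
+ *		if (req == NULL)
+ *			return -ENOMEM;
+ *
+ *		ptlrpc_request_set_replen(req);
+ *		rc = ptlrpc_queue_wait(req);
+ *		ptlrpc_req_finished(req);
+ *		return rc;
+ *	}
+ * \endcode
+ */
+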
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+ unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+ __ptlrpc_free_bulk(bulk, 1);
+}
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+ __ptlrpc_free_bulk(bulk, 0);
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset, int len, int);
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset,
+ int len)
+{
+ __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1);
+}
+
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset,
+ int len)
+{
+ __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0);
+}
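+
+/*
+ * Usage sketch: attaching pages to a client-side bulk descriptor for a bulk
+ * read (BULK_PUT_SINK, i.e. the server PUTs data into our pages). The portal
+ * number, the caller-provided page array and the single-MD layout
+ * (max_brw = 1) are illustrative assumptions.
+ *
+ * \code
+ *	static int example_prep_bulk_read(struct ptlrpc_request *req,
+ *					  struct page **pages, int npages)
+ *	{
+ *		struct ptlrpc_bulk_desc *desc;
+ *		int i;
+ *
+ *		desc = ptlrpc_prep_bulk_imp(req, npages, 1,
+ *					    BULK_PUT_SINK, OST_BULK_PORTAL);
+ *		if (desc == NULL)
+ *			return -ENOMEM;
+ *
+ *		for (i = 0; i < npages; i++)
+ *			ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_SIZE);
+ *		return 0;
+ *	}
+ * \endcode
+ */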
+
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+ struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+ int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+ /* nbufs is the # of buffers to allocate when growing the pool */
+ unsigned int bc_nbufs;
+ /* buffer size to post */
+ unsigned int bc_buf_size;
+ /* portal to listen for requests on */
+ unsigned int bc_req_portal;
+ /* portal to send replies to */
+ unsigned int bc_rep_portal;
+ /* maximum request size to be accepted for this service */
+ unsigned int bc_req_max_size;
+ /* maximum reply size this service can ever send */
+ unsigned int bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+ /* threadname should be 8 characters or less - 6 will be added on */
+ char *tc_thr_name;
+ /* threads increasing factor for each CPU */
+ unsigned int tc_thr_factor;
+ /* service threads # to start on each partition while initializing */
+ unsigned int tc_nthrs_init;
+ /*
+ * lower bound of the threads upper-limit on each partition while running;
+ * service availability may be impacted if the number of threads is lower
+ * than this value. It can be ZERO if the service doesn't require
+ * CPU affinity or there is only one partition.
+ */
+ unsigned int tc_nthrs_base;
+ /* "soft" limit for total threads number */
+ unsigned int tc_nthrs_max;
+ /* user-specified number of threads; it will be validated against
+ * other members of this structure. */
+ unsigned int tc_nthrs_user;
+ /* set NUMA node affinity for service threads */
+ unsigned int tc_cpu_affinity;
+ /* Tags for lu_context associated with service thread */
+ __u32 tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+ struct cfs_cpt_table *cc_cptable;
+ /* string pattern to describe CPTs for a service */
+ char *cc_pattern;
+};
+
+struct ptlrpc_service_conf {
+ /* service name */
+ char *psc_name;
+ /* soft watchdog timeout multiplier to print stuck service traces */
+ unsigned int psc_watchdog_factor;
+ /* buffer information */
+ struct ptlrpc_service_buf_conf psc_buf;
+ /* thread information */
+ struct ptlrpc_service_thr_conf psc_thr;
+ /* CPU partition information */
+ struct ptlrpc_service_cpt_conf psc_cpt;
+ /* function table */
+ struct ptlrpc_service_ops psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API. Register/unregister service, request state
+ * management, service thread management
+ *
+ * @{
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req,
+ struct lustre_handle *lock, int mode, int no_ack);
+void ptlrpc_commit_replies(struct obd_export *exp);
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
+void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
+struct ptlrpc_service *ptlrpc_register_service(
+ struct ptlrpc_service_conf *conf,
+ struct proc_dir_entry *proc_entry);
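+
+/*
+ * Usage sketch: filling in a ptlrpc_service_conf and registering a service.
+ * All numeric values, the portal numbers, the trivial request handler and the
+ * NULL proc entry are placeholders; real services derive them from their own
+ * constants.
+ *
+ * \code
+ *	static int example_req_handler(struct ptlrpc_request *req)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	static struct ptlrpc_service *example_register(void)
+ *	{
+ *		struct ptlrpc_service_conf conf = {
+ *			.psc_name		= "example",
+ *			.psc_watchdog_factor	= 1,
+ *			.psc_buf = {
+ *				.bc_nbufs	 = 64,
+ *				.bc_buf_size	 = 4096,
+ *				.bc_req_max_size = 4096,
+ *				.bc_rep_max_size = 4096,
+ *				.bc_req_portal	 = 30,
+ *				.bc_rep_portal	 = 31,
+ *			},
+ *			.psc_thr = {
+ *				.tc_thr_name	= "example",
+ *				.tc_nthrs_init	= 2,
+ *				.tc_nthrs_max	= 8,
+ *			},
+ *			.psc_ops = {
+ *				.so_req_handler	= example_req_handler,
+ *			},
+ *		};
+ *
+ *		return ptlrpc_register_service(&conf, NULL);
+ *	}
+ * \endcode
+ */
+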
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc);
+int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services(void *arg);
+void ptlrpc_daemonize(char *name);
+int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+ struct obd_export *export);
+
+int ptlrpc_hr_init(void);
+void ptlrpc_hr_fini(void);
+
+/** @} */
+
+/* ptlrpc/import.c */
+/**
+ * Import API
+ * @{
+ */
+int ptlrpc_connect_import(struct obd_import *imp);
+int ptlrpc_init_import(struct obd_import *imp);
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+void deuuidify(char *uuid, const char *prefix, char **uuid_start,
+ int *uuid_len);
+
+/* ptlrpc/pack_generic.c */
+int ptlrpc_reconnect_import(struct obd_import *imp);
+/** @} */
+
+/**
+ * ptlrpc msg buffer and swab interface
+ *
+ * @{
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+ int index);
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+ int index);
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len);
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len);
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+ char **bufs);
+int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
+ __u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
+ char **bufs);
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+ __u32 *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
+ char **bufs, int flags);
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+ unsigned int newlen, int move_data);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
+int __lustre_unpack_msg(struct lustre_msg *m, int len);
+int lustre_msg_hdr_size(__u32 magic, int count);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
+int lustre_msg_size_v2(int count, __u32 *lengths);
+int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+int lustre_msg_buflen(struct lustre_msg *m, int n);
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len);
+int lustre_msg_bufcount(struct lustre_msg *m);
+char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
+__u32 lustre_msg_get_flags(struct lustre_msg *msg);
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags);
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg);
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags);
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg);
+__u32 lustre_msg_get_type(struct lustre_msg *msg);
+__u32 lustre_msg_get_version(struct lustre_msg *msg);
+void lustre_msg_add_version(struct lustre_msg *msg, int version);
+__u32 lustre_msg_get_opc(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
+__u64 lustre_msg_get_transno(struct lustre_msg *msg);
+__u64 lustre_msg_get_slv(struct lustre_msg *msg);
+__u32 lustre_msg_get_limit(struct lustre_msg *msg);
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv);
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
+int lustre_msg_get_status(struct lustre_msg *msg);
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
+__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+char *lustre_msg_get_jobid(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
+void lustre_msg_set_handle(struct lustre_msg *msg,
+ struct lustre_handle *handle);
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
+void lustre_msg_set_last_committed(struct lustre_msg *msg,
+ __u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes);
+void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
+
+static inline void
+lustre_shrink_reply(struct ptlrpc_request *req, int segment,
+ unsigned int newlen, int move_data)
+{
+ LASSERT(req->rq_reply_state);
+ LASSERT(req->rq_repmsg);
+ req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment,
+ newlen, move_data);
+}
+
+#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS
+
+static inline int ptlrpc_status_hton(int h)
+{
+ /*
+ * Positive errnos must be network errnos, such as LUSTRE_EDEADLK,
+ * ELDLM_LOCK_ABORTED, etc.
+ */
+ if (h < 0)
+ return -lustre_errno_hton(-h);
+ else
+ return h;
+}
+
+static inline int ptlrpc_status_ntoh(int n)
+{
+ /*
+ * See the comment in ptlrpc_status_hton().
+ */
+ if (n < 0)
+ return -lustre_errno_ntoh(-n);
+ else
+ return n;
+}
+
+#else
+
+#define ptlrpc_status_hton(h) (h)
+#define ptlrpc_status_ntoh(n) (n)
+
+#endif
+/** @} */
+
+/** Change request phase of \a req to \a new_phase */
+static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+ if (req->rq_phase == new_phase)
+ return;
+
+ if (new_phase == RQ_PHASE_UNREGISTERING) {
+ req->rq_next_phase = req->rq_phase;
+ if (req->rq_import)
+ atomic_inc(&req->rq_import->imp_unregistering);
+ }
+
+ if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+ if (req->rq_import)
+ atomic_dec(&req->rq_import->imp_unregistering);
+ }
+
+ DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"",
+ ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+ req->rq_phase = new_phase;
+}
+
+/**
+ * Returns true if request \a req got early reply and hard deadline is not met
+ */
+static inline int
+ptlrpc_client_early(struct ptlrpc_request *req)
+{
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > get_seconds())
+ return 0;
+ return req->rq_early;
+}
+
+/**
+ * Returns true if we got real reply from server for this request
+ */
+static inline int
+ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > get_seconds())
+ return 0;
+ return req->rq_replied;
+}
+
+/** Returns true if request \a req is in process of receiving server reply */
+static inline int
+ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > get_seconds())
+ return 1;
+ return req->rq_receiving_reply;
+}
+
+static inline int
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+ int rc;
+
+ spin_lock(&req->rq_lock);
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ req->rq_reply_deadline > get_seconds()) {
+ spin_unlock(&req->rq_lock);
+ return 1;
+ }
+ rc = req->rq_receiving_reply;
+ rc = rc || req->rq_req_unlink || req->rq_reply_unlink;
+ spin_unlock(&req->rq_lock);
+ return rc;
+}
+
+static inline void
+ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+ if (req->rq_set == NULL)
+ wake_up(&req->rq_reply_waitq);
+ else
+ wake_up(&req->rq_set->set_waitq);
+}
+
+static inline void
+ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
+{
+ LASSERT(atomic_read(&rs->rs_refcount) > 0);
+ atomic_inc(&rs->rs_refcount);
+}
+
+static inline void
+ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
+{
+ LASSERT(atomic_read(&rs->rs_refcount) > 0);
+ if (atomic_dec_and_test(&rs->rs_refcount))
+ lustre_free_reply_state(rs);
+}
+
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+ if (req->rq_reply_state == NULL)
+ return; /* shouldn't occur */
+ ptlrpc_rs_decref(req->rq_reply_state);
+ req->rq_reply_state = NULL;
+ req->rq_repmsg = NULL;
+}
+
+static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
+{
+ return lustre_msg_get_magic(req->rq_reqmsg);
+}
+
+static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
+{
+ switch (req->rq_reqmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return req->rq_reqmsg->lm_repsize;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n",
+ req->rq_reqmsg->lm_magic);
+ return -EFAULT;
+ }
+}
+
+static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
+{
+ if (req->rq_delay_limit != 0 &&
+ time_before(cfs_time_add(req->rq_queued_time,
+ cfs_time_seconds(req->rq_delay_limit)),
+ cfs_time_current())) {
+ return 1;
+ }
+ return 0;
+}
+
+static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
+{
+ if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
+ spin_lock(&req->rq_lock);
+ req->rq_no_resend = 1;
+ spin_unlock(&req->rq_lock);
+ }
+ return req->rq_no_resend;
+}
+
+static inline int
+ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+ int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+ return svcpt->scp_service->srv_watchdog_factor *
+ max_t(int, at, obd_timeout);
+}
+
+static inline struct ptlrpc_service *
+ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_rqbd != NULL);
+ return req->rq_rqbd->rqbd_svcpt->scp_service;
+}
+
+/* ldlm/ldlm_lib.c */
+/**
+ * Target client logic
+ * @{
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+int client_obd_cleanup(struct obd_device *obddev);
+int client_connect_import(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *obd,
+ struct obd_uuid *cluuid, struct obd_connect_data *,
+ void *localdata);
+int client_disconnect_export(struct obd_export *exp);
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority);
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid);
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+ struct obd_uuid *uuid);
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
+void client_destroy_import(struct obd_import *imp);
+/** @} */
+
+
+/* ptlrpc/pinger.c */
+/**
+ * Pinger API (client side only)
+ * @{
+ */
+enum timeout_event {
+ TIMEOUT_GRANT = 1
+};
+struct timeout_item;
+typedef int (*timeout_cb_t)(struct timeout_item *, void *);
+int ptlrpc_pinger_add_import(struct obd_import *imp);
+int ptlrpc_pinger_del_import(struct obd_import *imp);
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+ timeout_cb_t cb, void *data,
+ struct list_head *obd_list);
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+ enum timeout_event event);
+struct ptlrpc_request *ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+void ptlrpc_pinger_ir_up(void);
+void ptlrpc_pinger_ir_down(void);
+/** @} */
+int ptlrpc_pinger_suppress_pings(void);
+
+/* ptlrpc daemon bind policy */
+typedef enum {
+ /* all ptlrpcd threads are free mode */
+ PDB_POLICY_NONE = 1,
+ /* all ptlrpcd threads are bound mode */
+ PDB_POLICY_FULL = 2,
+ /* <free1 bound1> <free2 bound2> ... <freeN boundN> */
+ PDB_POLICY_PAIR = 3,
+ /* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>,
+ * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1].
+ * If the kernel supports NUMA, ptlrpcd threads are bound and
+ * grouped by NUMA node */
+ PDB_POLICY_NEIGHBOR = 4,
+} pdb_policy_t;
+
+/* ptlrpc daemon load policy
+ * It is the caller's duty to specify how to push the async RPC into some
+ * ptlrpcd queue, but this is not enforced; it is affected by
+ * "ptlrpcd_bind_policy". If it is "PDB_POLICY_FULL", then the RPC will be
+ * processed by the selected ptlrpcd. Otherwise, the RPC may be processed by
+ * the selected ptlrpcd or its partner, depending on which is scheduled
+ * first, to accelerate RPC processing. */
+typedef enum {
+ /* on the same CPU core as the caller */
+ PDL_POLICY_SAME = 1,
+ /* within the same CPU partition, but not the same core as the caller */
+ PDL_POLICY_LOCAL = 2,
+ /* round-robin on all CPU cores, but not the same core as the caller */
+ PDL_POLICY_ROUND = 3,
+ /* the specified CPU core is preferred, but not enforced */
+ PDL_POLICY_PREFERRED = 4,
+} pdl_policy_t;
+
+/* ptlrpc/ptlrpcd.c */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
+void ptlrpcd_wake(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
+int ptlrpcd_addref(void);
+void ptlrpcd_decref(void);
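+
+/*
+ * Usage sketch: handing an already-packed request to a ptlrpcd thread instead
+ * of waiting synchronously. The reply interpreter signature follows the one
+ * used by ptlrpc_req_interpret() above; the PDL_POLICY_ROUND choice and the
+ * "-1" index (no preferred core) are illustrative assumptions.
+ *
+ * \code
+ *	static int example_interpret(const struct lu_env *env,
+ *				     struct ptlrpc_request *req,
+ *				     void *args, int rc)
+ *	{
+ *		return rc;
+ *	}
+ *
+ *	static void example_send_async(struct ptlrpc_request *req)
+ *	{
+ *		req->rq_interpret_reply = example_interpret;
+ *		ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ *	}
+ * \endcode
+ */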
+
+/* ptlrpc/lproc_ptlrpc.c */
+/**
+ * procfs output related functions
+ * @{
+ */
+const char *ll_opcode2str(__u32 opcode);
+#if defined (CONFIG_PROC_FS)
+void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
+#else
+static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
+#endif
+/** @} */
+
+/* ptlrpc/llog_client.c */
+extern struct llog_operations llog_client_ops;
+
+/** @} net */
+
+#endif
+/** @} PtlRPC */
diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h
new file mode 100644
index 000000000..ed654684c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_param.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
+ *
+ * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/** \defgroup param param
+ *
+ * @{
+ */
+
+/* For interoperability */
+struct cfg_interop_param {
+ char *old_param;
+ char *new_param;
+};
+
+/* obd_config.c */
+int class_find_param(char *buf, char *key, char **valp);
+struct cfg_interop_param *class_find_old_param(const char *param,
+ struct cfg_interop_param *ptr);
+int class_get_next_param(char **params, char *copy);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_net(char *buf, __u32 *net, char **endh);
+int class_match_nid(char *buf, char *key, lnet_nid_t nid);
+int class_match_net(char *buf, char *key, __u32 net);
+/* obd_mount.c */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+ char *s1, char *s2, char *s3, char *s4);
+
+
+
+/****************** User-settable parameter keys *********************/
+/* e.g.
+ tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda
+ lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0
+ ... testfs-MDT0000.lov.stripesize=4M
+ ... testfs-OST0000.ost.client_cache_seconds=15
+ ... testfs.sys.timeout=<secs>
+ ... testfs.llite.max_read_ahead_mb=16
+*/
+
+/* System global or special params not handled in obd's proc
+ * See mgs_write_log_sys()
+ */
+#define PARAM_TIMEOUT "timeout=" /* global */
+#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */
+#define PARAM_AT_MIN "at_min=" /* global */
+#define PARAM_AT_MAX "at_max=" /* global */
+#define PARAM_AT_EXTRA "at_extra=" /* global */
+#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */
+#define PARAM_AT_HISTORY "at_history=" /* global */
+#define PARAM_JOBID_VAR "jobid_var=" /* global */
+#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */
+#define PARAM_FAILNODE "failover.node=" /* add failover nid */
+#define PARAM_FAILMODE "failover.mode=" /* initial mount only */
+#define PARAM_ACTIVE "active=" /* activate/deactivate */
+#define PARAM_NETWORK "network=" /* bind on nid */
+#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */
+
+/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */
+#define PARAM_OST "ost."
+#define PARAM_OSC "osc."
+#define PARAM_MDT "mdt."
+#define PARAM_MDD "mdd."
+#define PARAM_MDC "mdc."
+#define PARAM_LLITE "llite."
+#define PARAM_LOV "lov."
+#define PARAM_LOD "lod."
+#define PARAM_OSP "osp."
+#define PARAM_SYS "sys." /* global */
+#define PARAM_SRPC "srpc."
+#define PARAM_SRPC_FLVR "srpc.flavor."
+#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt"
+#define PARAM_SEC "security."
+#define PARAM_QUOTA "quota." /* global */
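+
+/*
+ * Usage sketch: matching one of the keys above in a configuration string and
+ * parsing the NID that follows it. Only class_match_param(), class_parse_nid()
+ * and PARAM_FAILNODE come from this header; the surrounding buffer handling
+ * and the NULL "endh" argument are assumptions.
+ *
+ * \code
+ *	static int example_parse_failnode(char *buf, lnet_nid_t *nid)
+ *	{
+ *		char *val = NULL;
+ *
+ *		if (class_match_param(buf, PARAM_FAILNODE, &val) != 0)
+ *			return -ENOENT;
+ *		return class_parse_nid(val, nid, NULL);
+ *	}
+ * \endcode
+ */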
+
+/** @} param */
+
+#endif /* _LUSTRE_PARAM_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h
new file mode 100644
index 000000000..2643f2807
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_quota.h
@@ -0,0 +1,241 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+
+#include "dt_object.h"
+#include "lustre_fid.h"
+#include "lustre_dlm.h"
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gather all quota record types in a union that can be used to read any record
+ * from disk. All fields of these records must be 64-bit aligned, otherwise the
+ * OSD layer may swab them incorrectly. */
+union lquota_rec {
+ struct lquota_glb_rec lqr_glb_rec;
+ struct lquota_slv_rec lqr_slv_rec;
+ struct lquota_acct_rec lqr_acct_rec;
+};
+
+/* Index features supported by the global index objects.
+ * Only used for migration purposes and should be removed once on-disk migration
+ * is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Name used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME "mdt="
+#define QUOTA_DATAPOOL_NAME "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. This won't be needed any more once the QMT is a real target and
+ * does not rely any more on the MDT service threads and namespace. */
+struct qmt_handlers {
+ /* Handle quotactl request from client. */
+ int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+ struct obd_quotactl *);
+
+ /* Handle dqacq/dqrel request from slave. */
+ int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+ struct ptlrpc_request *);
+
+ /* LDLM intent policy associated with quota locks */
+ int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+ struct ptlrpc_request *, struct ldlm_lock **,
+ int);
+
+ /* Initialize LVB of ldlm resource associated with quota objects */
+ int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+ /* Update LVB of ldlm resource associated with quota objects */
+ int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+ struct ptlrpc_request *, int);
+
+ /* Return size of LVB to be packed in ldlm message */
+ int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+ /* Fill request buffer with lvb */
+ int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+ int);
+
+ /* Free lvb associated with ldlm resource */
+ int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented in the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ * instance via qsd_init(). This creates all required structures
+ * to manage quota enforcement for this target and performs all
+ * low-level initialization which does not involve any lustre
+ * object. qsd_init() should typically be called when the OSD
+ * is being set up.
+ *
+ * - qsd_prepare(): This sets up on-disk objects associated with the quota slave
+ * feature and initiates the quota reintegration procedure if
+ * needed. qsd_prepare() should typically be called when
+ * ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ * (i.e. when ->ldo_recovery_complete is called). This is used
+ * to notify the qsd layer that quota should now be enforced
+ * again via the qsd_op_begin/end functions. The last step of the
+ * reintegration procedure (namely usage reconciliation) will be
+ * completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ * qsd_init(). This releases all quota slave objects and frees the
+ * structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota; it must be called in the
+ * declaration of each operation. qsd_op_end() should then be
+ * invoked later once all operations have been completed in
+ * order to release/adjust the quota space.
+ * Running qsd_op_begin() before qsd_start() isn't fatal and
+ * will return success.
+ * Once qsd_start() has been run, qsd_op_begin() will block
+ * until the reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post-operation quota processing. This must be
+ *		called after the operation's transaction has stopped.
+ * While qsd_op_begin() must be invoked each time a new
+ * operation is declared, qsd_op_end() should be called only
+ * once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by the OSD layer to manage
+ * quota enforcement; a usage sketch follows the prototypes. Arguments are
+ * documented where each function is defined. */
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+ struct proc_dir_entry *);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+ struct lquota_trans *, struct lquota_id_info *, int *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+ struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+ union lquota_id *, int);
+/* This is exported for the ldiskfs quota migration only,
+ * see convert_quota_file() */
+int lquota_disk_write_glb(const struct lu_env *, struct dt_object *,
+ __u64, struct lquota_glb_rec *);
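+
+/*
+ * Usage sketch (illustrative only, not part of the original interface): how
+ * an OSD-like caller might drive the qsd API documented above. "env",
+ * "svname", "osd_dev", "osd_proc", "trans", "qi" and "flags" are hypothetical
+ * placeholders for objects the caller already owns; error handling is
+ * abbreviated.
+ *
+ *	struct qsd_instance *qsd;
+ *
+ *	qsd = qsd_init(env, svname, osd_dev, osd_proc);
+ *	rc = qsd_prepare(env, qsd);			// from ->ldo_prepare
+ *	rc = qsd_start(env, qsd);			// once recovery completes
+ *
+ *	rc = qsd_op_begin(env, qsd, trans, &qi, &flags);  // per declaration
+ *	...						// run & stop transaction
+ *	qsd_op_end(env, qsd, trans);			// once per transaction
+ *
+ *	qsd_fini(env, qsd);				// at shutdown
+ */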
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+ /* quota identifier */
+ union lquota_id lqi_id;
+
+ /* USRQUOTA or GRPQUOTA for now; this could be expanded for
+ * directory quota or other types later. */
+ int lqi_type;
+
+ /* inodes or kbytes to be consumed or released; the value
+ * can be negative when releasing space. */
+ long long lqi_space;
+
+ /* quota slave entry structure associated with this ID */
+ struct lquota_entry *lqi_qentry;
+
+ /* whether we are reporting blocks or inodes */
+ bool lqi_is_blk;
+};
+
+/* Since we enforce only inode quota in the meta pool (MDTs) and block quota
+ * in the data pool (OSTs), at most 4 quota IDs are enforced in a single
+ * transaction; the worst case is a chown transaction, which involves the
+ * original uid and gid plus the new uid and gid (see the example below).
+ *
+ * This value might need to be revised when directory quota is added. */
+#define QUOTA_MAX_TRANSIDS 4
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+ unsigned short lqt_id_cnt;
+ struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS];
+};
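+
+/*
+ * Example (illustrative, not from the original header): a chown from
+ * uid 1000/gid 1000 to uid 2000/gid 2000 touches the worst-case set of
+ * QUOTA_MAX_TRANSIDS identifiers in one transaction:
+ *
+ *	lqt_ids[0]: lqi_type = USRQUOTA, lqi_id = 1000 (space released)
+ *	lqt_ids[1]: lqi_type = GRPQUOTA, lqi_id = 1000 (space released)
+ *	lqt_ids[2]: lqi_type = USRQUOTA, lqi_id = 2000 (space consumed)
+ *	lqt_ids[3]: lqi_type = GRPQUOTA, lqi_id = 2000 (space consumed)
+ */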
+
+/* flags for quota local enforcement */
+#define QUOTA_FL_OVER_USRQUOTA 0x01
+#define QUOTA_FL_OVER_GRPQUOTA 0x02
+#define QUOTA_FL_SYNC 0x04
+
+#define IS_LQUOTA_RES(res) \
+ (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \
+ res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+ struct obd_quotactl *);
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
new file mode 100644
index 000000000..c6457b27c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
@@ -0,0 +1,341 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {
+ RCL_CLIENT,
+ RCL_SERVER,
+ RCL_NR
+};
+
+/* Maximal number of fields (buffers) in a request message. */
+#define REQ_MAX_FIELD_NR 9
+
+struct req_capsule {
+ struct ptlrpc_request *rc_req;
+ const struct req_format *rc_fmt;
+ enum req_location rc_loc;
+ __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* struct ptlrpc_request, lustre_msg* */
+#include "lustre_net.h"
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+ enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc);
+int req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len);
+void *req_capsule_server_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+ const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc, int size);
+int req_capsule_get_size(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+ enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ unsigned int newlen,
+ enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ unsigned int newlen);
+int req_layout_init(void);
+void req_layout_fini(void);
+
+/* __REQ_LAYOUT_USER__ */
+#endif
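+
+/*
+ * Usage sketch (illustrative, not part of the original header): a server-side
+ * handler for one of the formats declared below typically binds the capsule
+ * to a format, reads the client fields, sizes the reply fields and packs the
+ * reply. "req" is a hypothetical struct ptlrpc_request whose embedded capsule
+ * is assumed to be reachable as &req->rq_pill, and "md_size" is a hypothetical
+ * reply buffer size.
+ *
+ *	struct req_capsule *pill = &req->rq_pill;
+ *	struct mdt_body *body, *repbody;
+ *
+ *	req_capsule_set(pill, &RQF_MDS_GETATTR);
+ *	body = req_capsule_client_get(pill, &RMF_MDT_BODY);
+ *	req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, md_size);
+ *	rc = req_capsule_server_pack(pill);
+ *	repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ */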
+
+extern struct req_format RQF_OBD_PING;
+extern struct req_format RQF_OBD_SET_INFO;
+extern struct req_format RQF_SEC_CTX;
+extern struct req_format RQF_OBD_IDX_READ;
+/* MGS req_format */
+extern struct req_format RQF_MGS_TARGET_REG;
+extern struct req_format RQF_MGS_SET_INFO;
+extern struct req_format RQF_MGS_CONFIG_READ;
+/* fid/fld req_format */
+extern struct req_format RQF_SEQ_QUERY;
+extern struct req_format RQF_FLD_QUERY;
+/* MDS req_format */
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_STATFS;
+extern struct req_format RQF_MDS_GETSTATUS;
+extern struct req_format RQF_MDS_SYNC;
+extern struct req_format RQF_MDS_GETXATTR;
+extern struct req_format RQF_MDS_GETATTR;
+extern struct req_format RQF_UPDATE_OBJ;
+
+/*
+ * This is the format of a direct (non-intent) MDS_GETATTR_NAME request.
+ */
+extern struct req_format RQF_MDS_GETATTR_NAME;
+extern struct req_format RQF_MDS_CLOSE;
+extern struct req_format RQF_MDS_RELEASE_CLOSE;
+extern struct req_format RQF_MDS_PIN;
+extern struct req_format RQF_MDS_UNPIN;
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_READPAGE;
+extern struct req_format RQF_MDS_WRITEPAGE;
+extern struct req_format RQF_MDS_IS_SUBDIR;
+extern struct req_format RQF_MDS_DONE_WRITING;
+extern struct req_format RQF_MDS_REINT;
+extern struct req_format RQF_MDS_REINT_CREATE;
+extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL;
+extern struct req_format RQF_MDS_REINT_CREATE_SLAVE;
+extern struct req_format RQF_MDS_REINT_CREATE_SYM;
+extern struct req_format RQF_MDS_REINT_OPEN;
+extern struct req_format RQF_MDS_REINT_UNLINK;
+extern struct req_format RQF_MDS_REINT_LINK;
+extern struct req_format RQF_MDS_REINT_RENAME;
+extern struct req_format RQF_MDS_REINT_SETATTR;
+extern struct req_format RQF_MDS_REINT_SETXATTR;
+extern struct req_format RQF_MDS_QUOTACHECK;
+extern struct req_format RQF_MDS_QUOTACTL;
+extern struct req_format RQF_QC_CALLBACK;
+extern struct req_format RQF_QUOTA_DQACQ;
+extern struct req_format RQF_MDS_SWAP_LAYOUTS;
+/* MDS hsm formats */
+extern struct req_format RQF_MDS_HSM_STATE_GET;
+extern struct req_format RQF_MDS_HSM_STATE_SET;
+extern struct req_format RQF_MDS_HSM_ACTION;
+extern struct req_format RQF_MDS_HSM_PROGRESS;
+extern struct req_format RQF_MDS_HSM_CT_REGISTER;
+extern struct req_format RQF_MDS_HSM_CT_UNREGISTER;
+extern struct req_format RQF_MDS_HSM_REQUEST;
+/* OST req_format */
+extern struct req_format RQF_OST_CONNECT;
+extern struct req_format RQF_OST_DISCONNECT;
+extern struct req_format RQF_OST_QUOTACHECK;
+extern struct req_format RQF_OST_QUOTACTL;
+extern struct req_format RQF_OST_GETATTR;
+extern struct req_format RQF_OST_SETATTR;
+extern struct req_format RQF_OST_CREATE;
+extern struct req_format RQF_OST_PUNCH;
+extern struct req_format RQF_OST_SYNC;
+extern struct req_format RQF_OST_DESTROY;
+extern struct req_format RQF_OST_BRW_READ;
+extern struct req_format RQF_OST_BRW_WRITE;
+extern struct req_format RQF_OST_STATFS;
+extern struct req_format RQF_OST_SET_GRANT_INFO;
+extern struct req_format RQF_OST_GET_INFO_GENERIC;
+extern struct req_format RQF_OST_GET_INFO_LAST_ID;
+extern struct req_format RQF_OST_GET_INFO_LAST_FID;
+extern struct req_format RQF_OST_SET_INFO_LAST_FID;
+extern struct req_format RQF_OST_GET_INFO_FIEMAP;
+
+/* LDLM req_format */
+extern struct req_format RQF_LDLM_ENQUEUE;
+extern struct req_format RQF_LDLM_ENQUEUE_LVB;
+extern struct req_format RQF_LDLM_CONVERT;
+extern struct req_format RQF_LDLM_INTENT;
+extern struct req_format RQF_LDLM_INTENT_BASIC;
+extern struct req_format RQF_LDLM_INTENT_LAYOUT;
+extern struct req_format RQF_LDLM_INTENT_GETATTR;
+extern struct req_format RQF_LDLM_INTENT_OPEN;
+extern struct req_format RQF_LDLM_INTENT_CREATE;
+extern struct req_format RQF_LDLM_INTENT_UNLINK;
+extern struct req_format RQF_LDLM_INTENT_GETXATTR;
+extern struct req_format RQF_LDLM_INTENT_QUOTA;
+extern struct req_format RQF_LDLM_CANCEL;
+extern struct req_format RQF_LDLM_CALLBACK;
+extern struct req_format RQF_LDLM_CP_CALLBACK;
+extern struct req_format RQF_LDLM_BL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_DESC_CALLBACK;
+/* LOG req_format */
+extern struct req_format RQF_LOG_CANCEL;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER;
+extern struct req_format RQF_LLOG_ORIGIN_CONNECT;
+
+extern struct req_format RQF_CONNECT;
+
+extern struct req_msg_field RMF_GENERIC_DATA;
+extern struct req_msg_field RMF_PTLRPC_BODY;
+extern struct req_msg_field RMF_MDT_BODY;
+extern struct req_msg_field RMF_MDT_EPOCH;
+extern struct req_msg_field RMF_OBD_STATFS;
+extern struct req_msg_field RMF_NAME;
+extern struct req_msg_field RMF_SYMTGT;
+extern struct req_msg_field RMF_TGTUUID;
+extern struct req_msg_field RMF_CLUUID;
+extern struct req_msg_field RMF_SETINFO_VAL;
+extern struct req_msg_field RMF_SETINFO_KEY;
+extern struct req_msg_field RMF_GETINFO_VAL;
+extern struct req_msg_field RMF_GETINFO_VALLEN;
+extern struct req_msg_field RMF_GETINFO_KEY;
+extern struct req_msg_field RMF_IDX_INFO;
+extern struct req_msg_field RMF_CLOSE_DATA;
+
+/*
+ * Connection handle received in an MDS_CONNECT request.
+ */
+extern struct req_msg_field RMF_CONN;
+extern struct req_msg_field RMF_CONNECT_DATA;
+extern struct req_msg_field RMF_DLM_REQ;
+extern struct req_msg_field RMF_DLM_REP;
+extern struct req_msg_field RMF_DLM_LVB;
+extern struct req_msg_field RMF_DLM_GL_DESC;
+extern struct req_msg_field RMF_LDLM_INTENT;
+extern struct req_msg_field RMF_LAYOUT_INTENT;
+extern struct req_msg_field RMF_MDT_MD;
+extern struct req_msg_field RMF_REC_REINT;
+extern struct req_msg_field RMF_EADATA;
+extern struct req_msg_field RMF_EAVALS;
+extern struct req_msg_field RMF_EAVALS_LENS;
+extern struct req_msg_field RMF_ACL;
+extern struct req_msg_field RMF_LOGCOOKIES;
+extern struct req_msg_field RMF_CAPA1;
+extern struct req_msg_field RMF_CAPA2;
+extern struct req_msg_field RMF_OBD_QUOTACHECK;
+extern struct req_msg_field RMF_OBD_QUOTACTL;
+extern struct req_msg_field RMF_QUOTA_BODY;
+extern struct req_msg_field RMF_STRING;
+extern struct req_msg_field RMF_SWAP_LAYOUTS;
+extern struct req_msg_field RMF_MDS_HSM_PROGRESS;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+extern struct req_msg_field RMF_MDS_HSM_USER_ITEM;
+extern struct req_msg_field RMF_MDS_HSM_ARCHIVE;
+extern struct req_msg_field RMF_HSM_USER_STATE;
+extern struct req_msg_field RMF_HSM_STATE_SET;
+extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+
+/* seq-mgr fields */
+extern struct req_msg_field RMF_SEQ_OPC;
+extern struct req_msg_field RMF_SEQ_RANGE;
+extern struct req_msg_field RMF_FID_SPACE;
+
+/* FLD fields */
+extern struct req_msg_field RMF_FLD_OPC;
+extern struct req_msg_field RMF_FLD_MDFLD;
+
+extern struct req_msg_field RMF_LLOGD_BODY;
+extern struct req_msg_field RMF_LLOG_LOG_HDR;
+extern struct req_msg_field RMF_LLOGD_CONN_BODY;
+
+extern struct req_msg_field RMF_MGS_TARGET_INFO;
+extern struct req_msg_field RMF_MGS_SEND_PARAM;
+
+extern struct req_msg_field RMF_OST_BODY;
+extern struct req_msg_field RMF_OBD_IOOBJ;
+extern struct req_msg_field RMF_OBD_ID;
+extern struct req_msg_field RMF_FID;
+extern struct req_msg_field RMF_NIOBUF_REMOTE;
+extern struct req_msg_field RMF_RCS;
+extern struct req_msg_field RMF_FIEMAP_KEY;
+extern struct req_msg_field RMF_FIEMAP_VAL;
+extern struct req_msg_field RMF_OST_ID;
+
+/* MGS config read message format */
+extern struct req_msg_field RMF_MGS_CONFIG_BODY;
+extern struct req_msg_field RMF_MGS_CONFIG_RES;
+
+/* generic uint32 */
+extern struct req_msg_field RMF_U32;
+
+/* OBJ update format */
+extern struct req_msg_field RMF_UPDATE;
+extern struct req_msg_field RMF_UPDATE_REPLY;
+/** @} req_layout */
+
+#endif /* _LUSTRE_REQ_LAYOUT_H__ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h
new file mode 100644
index 000000000..dff70a5b9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_sec.h
@@ -0,0 +1,1147 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_SEC_H_
+#define _LUSTRE_SEC_H_
+
+/** \defgroup sptlrpc sptlrpc
+ *
+ * @{
+ */
+
+/*
+ * declared here to avoid extra includes
+ */
+struct obd_import;
+struct obd_export;
+struct ptlrpc_request;
+struct ptlrpc_reply_state;
+struct ptlrpc_bulk_desc;
+struct brw_page;
+/* Linux specific */
+struct key;
+struct seq_file;
+
+/*
+ * forward declaration
+ */
+struct ptlrpc_sec_policy;
+struct ptlrpc_sec_cops;
+struct ptlrpc_sec_sops;
+struct ptlrpc_sec;
+struct ptlrpc_svc_ctx;
+struct ptlrpc_cli_ctx;
+struct ptlrpc_ctx_ops;
+
+/**
+ * \addtogroup flavor flavor
+ *
+ * An RPC flavor is represented by a 32-bit integer. Currently the high 12
+ * bits are unused and must be set to 0 to allow for future expansion (a
+ * worked example follows the macros below).
+ * <pre>
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech) | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * </pre>
+ *
+ * @{
+ */
+
+/*
+ * flavor constants
+ */
+enum sptlrpc_policy {
+ SPTLRPC_POLICY_NULL = 0,
+ SPTLRPC_POLICY_PLAIN = 1,
+ SPTLRPC_POLICY_GSS = 2,
+ SPTLRPC_POLICY_MAX,
+};
+
+enum sptlrpc_mech_null {
+ SPTLRPC_MECH_NULL = 0,
+ SPTLRPC_MECH_NULL_MAX,
+};
+
+enum sptlrpc_mech_plain {
+ SPTLRPC_MECH_PLAIN = 0,
+ SPTLRPC_MECH_PLAIN_MAX,
+};
+
+enum sptlrpc_mech_gss {
+ SPTLRPC_MECH_GSS_NULL = 0,
+ SPTLRPC_MECH_GSS_KRB5 = 1,
+ SPTLRPC_MECH_GSS_MAX,
+};
+
+enum sptlrpc_service_type {
+ SPTLRPC_SVC_NULL = 0, /**< no security */
+ SPTLRPC_SVC_AUTH = 1, /**< authentication only */
+ SPTLRPC_SVC_INTG = 2, /**< integrity */
+ SPTLRPC_SVC_PRIV = 3, /**< privacy */
+ SPTLRPC_SVC_MAX,
+};
+
+enum sptlrpc_bulk_type {
+ SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */
+ SPTLRPC_BULK_HASH = 1, /**< hash integrity */
+ SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+ SPTLRPC_BULK_SVC_NULL = 0, /**< no security */
+ SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */
+ SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */
+ SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */
+ SPTLRPC_BULK_SVC_MAX,
+};
+
+/*
+ * compose/extract macros
+ */
+#define FLVR_POLICY_OFFSET (0)
+#define FLVR_MECH_OFFSET (4)
+#define FLVR_SVC_OFFSET (8)
+#define FLVR_BULK_TYPE_OFFSET (12)
+#define FLVR_BULK_SVC_OFFSET (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \
+ (((__u32)(policy) << FLVR_POLICY_OFFSET) | \
+ ((__u32)(mech) << FLVR_MECH_OFFSET) | \
+ ((__u32)(svc) << FLVR_SVC_OFFSET) | \
+ ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \
+ ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET))
+
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor) \
+ ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor) \
+ ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor) \
+ ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor) \
+ ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor) \
+ ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor) \
+ ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor) \
+ ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
+
+/*
+ * gss subflavors
+ */
+#define MAKE_BASE_SUBFLVR(mech, svc) \
+ ((__u32)(mech) | \
+ ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
+#define SPTLRPC_SUBFLVR_KRB5N \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+#define SPTLRPC_SUBFLVR_KRB5A \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+#define SPTLRPC_SUBFLVR_KRB5I \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+#define SPTLRPC_SUBFLVR_KRB5P \
+ MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
+
+/*
+ * "end user" flavors
+ */
+#define SPTLRPC_FLVR_NULL \
+ MAKE_FLVR(SPTLRPC_POLICY_NULL, \
+ SPTLRPC_MECH_NULL, \
+ SPTLRPC_SVC_NULL, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_PLAIN \
+ MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \
+ SPTLRPC_MECH_PLAIN, \
+ SPTLRPC_SVC_NULL, \
+ SPTLRPC_BULK_HASH, \
+ SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5N \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_NULL, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5A \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_AUTH, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5I \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_INTG, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5P \
+ MAKE_FLVR(SPTLRPC_POLICY_GSS, \
+ SPTLRPC_MECH_GSS_KRB5, \
+ SPTLRPC_SVC_PRIV, \
+ SPTLRPC_BULK_DEFAULT, \
+ SPTLRPC_BULK_SVC_PRIV)
+
+#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL
+
+#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000)
+
+/**
+ * extract the useful part from wire flavor
+ */
+#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF)
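+
+/*
+ * Worked example (illustrative): expanding SPTLRPC_FLVR_KRB5I with the
+ * macros above gives
+ *
+ *	SPTLRPC_FLVR_KRB5I = (2 << 0) | (1 << 4) | (2 << 8) |
+ *			     (0 << 12) | (2 << 16) = 0x20212
+ *
+ * so SPTLRPC_FLVR_POLICY() yields SPTLRPC_POLICY_GSS (2),
+ * SPTLRPC_FLVR_MECH() yields SPTLRPC_MECH_GSS_KRB5 (1), SPTLRPC_FLVR_SVC()
+ * yields SPTLRPC_SVC_INTG (2) and SPTLRPC_FLVR_BULK_SVC() yields
+ * SPTLRPC_BULK_SVC_INTG (2); SPTLRPC_FLVR_BASE_SUB() gives 0x21, matching
+ * SPTLRPC_SUBFLVR_KRB5I.
+ */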
+
+/** @} flavor */
+
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
+{
+ LASSERT(svc < SPTLRPC_SVC_MAX);
+ *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+ SPTLRPC_FLVR_MECH(*flvr),
+ svc,
+ SPTLRPC_FLVR_BULK_TYPE(*flvr),
+ SPTLRPC_FLVR_BULK_SVC(*flvr));
+}
+
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{
+ LASSERT(svc < SPTLRPC_BULK_SVC_MAX);
+ *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+ SPTLRPC_FLVR_MECH(*flvr),
+ SPTLRPC_FLVR_SVC(*flvr),
+ SPTLRPC_FLVR_BULK_TYPE(*flvr),
+ svc);
+}
+
+struct bulk_spec_hash {
+ __u8 hash_alg;
+};
+
+/**
+ * Full description of the flavors being used on a ptlrpc connection,
+ * including both the regular RPC and bulk transfer parts.
+ */
+struct sptlrpc_flavor {
+ /**
+ * wire flavor, should be renamed to sf_wire.
+ */
+ __u32 sf_rpc;
+ /**
+ * general flags of PTLRPC_SEC_FL_*
+ */
+ __u32 sf_flags;
+ /**
+ * rpc flavor specification
+ */
+ union {
+ /* nothing for now */
+ } u_rpc;
+ /**
+ * bulk flavor specification
+ */
+ union {
+ struct bulk_spec_hash hash;
+ } u_bulk;
+};
+
+/**
+ * Identifies which part of Lustre generated the RPC. It is encoded into
+ * RPC requests and checked by the ptlrpc service.
+ */
+enum lustre_sec_part {
+ LUSTRE_SP_CLI = 0,
+ LUSTRE_SP_MDT,
+ LUSTRE_SP_OST,
+ LUSTRE_SP_MGC,
+ LUSTRE_SP_MGS,
+ LUSTRE_SP_ANY = 0xFF
+};
+
+const char *sptlrpc_part2name(enum lustre_sec_part sp);
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd);
+
+/**
+ * A rule specifies a flavor to be used by a ptlrpc connection between
+ * two Lustre parts.
+ */
+struct sptlrpc_rule {
+ __u32 sr_netid; /* LNET network ID */
+ __u8 sr_from; /* sec_part */
+ __u8 sr_to; /* sec_part */
+ __u16 sr_padding;
+ struct sptlrpc_flavor sr_flvr;
+};
+
+/**
+ * A set of rules in memory.
+ *
+ * Rules are generated and stored on MGS, and propagated to MDT, OST,
+ * and client when needed.
+ */
+struct sptlrpc_rule_set {
+ int srs_nslot;
+ int srs_nrule;
+ struct sptlrpc_rule *srs_rules;
+};
+
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
+
+static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
+{
+ memset(set, 0, sizeof(*set));
+}
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
+ struct sptlrpc_rule *rule);
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+ enum lustre_sec_part from,
+ enum lustre_sec_part to,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *sf);
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set);
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg);
+void sptlrpc_conf_log_start(const char *logname);
+void sptlrpc_conf_log_stop(const char *logname);
+void sptlrpc_conf_log_update_begin(const char *logname);
+void sptlrpc_conf_log_update_end(const char *logname);
+void sptlrpc_conf_client_adapt(struct obd_device *obd);
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+ enum lustre_sec_part from,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *flavor);
+
+/* The maximum length of the security payload. 1024 is enough for Kerberos 5
+ * and is expected to be enough for future mechanisms, though this is not
+ * guaranteed. Only used by the pre-allocated request/reply pool.
+ */
+#define SPTLRPC_MAX_PAYLOAD (1024)
+
+
+struct vfs_cred {
+ uint32_t vc_uid;
+ uint32_t vc_gid;
+};
+
+struct ptlrpc_ctx_ops {
+ /**
+ * To determine whether it's suitable to use the \a ctx for \a vcred.
+ */
+ int (*match) (struct ptlrpc_cli_ctx *ctx,
+ struct vfs_cred *vcred);
+
+ /**
+ * To bring the \a ctx uptodate.
+ */
+ int (*refresh) (struct ptlrpc_cli_ctx *ctx);
+
+ /**
+ * Validate the \a ctx.
+ */
+ int (*validate) (struct ptlrpc_cli_ctx *ctx);
+
+ /**
+ * Force the \a ctx to die.
+ */
+ void (*force_die) (struct ptlrpc_cli_ctx *ctx,
+ int grace);
+ int (*display) (struct ptlrpc_cli_ctx *ctx,
+ char *buf, int bufsize);
+
+ /**
+ * Sign the request message using \a ctx.
+ *
+ * \pre req->rq_reqmsg points to the request message.
+ * \pre req->rq_reqlen is the request message length.
+ * \post req->rq_reqbuf points to the request message with signature.
+ * \post req->rq_reqdata_len is set to the final request message size.
+ *
+ * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign().
+ */
+ int (*sign) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Verify the reply message using \a ctx.
+ *
+ * \pre req->rq_repdata points to the reply message with signature.
+ * \pre req->rq_repdata_len is the total reply message length.
+ * \post req->rq_repmsg points to the reply message without signature.
+ * \post req->rq_replen is the reply message length.
+ *
+ * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify().
+ */
+ int (*verify) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Encrypt the request message using \a ctx.
+ *
+ * \pre req->rq_reqmsg points to the request message in clear text.
+ * \pre req->rq_reqlen is the request message length.
+ * \post req->rq_reqbuf points to the request message.
+ * \post req->rq_reqdata_len is set to the final request message size.
+ *
+ * \see gss_cli_ctx_seal().
+ */
+ int (*seal) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Decrypt the reply message using \a ctx.
+ *
+ * \pre req->rq_repdata points to the encrypted reply message.
+ * \pre req->rq_repdata_len is the total cipher text length.
+ * \post req->rq_repmsg points to the reply message in clear text.
+ * \post req->rq_replen is the reply message length in clear text.
+ *
+ * \see gss_cli_ctx_unseal().
+ */
+ int (*unseal) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req);
+
+ /**
+ * Wrap bulk request data. This is called before wrapping the RPC
+ * request message.
+ *
+ * \pre bulk buffer is described by desc->bd_iov and
+ * desc->bd_iov_count. Note that for a read it is just a buffer and no
+ * data needs to be sent; for a write it contains data in clear text.
+ * \post when necessary, ptlrpc_bulk_sec_desc is properly prepared
+ * (usually inside the RPC request message).
+ * - encryption: the cipher text bulk buffer is described by
+ *   desc->bd_enc_iov and desc->bd_iov_count (currently the iov
+ *   count is assumed to remain the same).
+ * - otherwise: the bulk buffer is still desc->bd_iov and
+ *   desc->bd_iov_count.
+ *
+ * \return 0: success.
+ * \return -ev: error code.
+ *
+ * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk().
+ */
+ int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+ /**
+ * Unwrap bulk reply data. This is called after wrapping the RPC
+ * reply message.
+ *
+ * \pre bulk buffer is described by desc->bd_iov/desc->bd_enc_iov and
+ * desc->bd_iov_count, according to wrap_bulk().
+ * \post final bulk data in clear text is placed in buffer described
+ * by desc->bd_iov and desc->bd_iov_count.
+ * \return +ve nob of actual bulk data in clear text.
+ * \return -ve error code.
+ *
+ * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+ */
+ int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+};
+
+#define PTLRPC_CTX_NEW_BIT (0) /* newly created */
+#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */
+#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */
+#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */
+#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */
+#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */
+
+#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT)
+#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT)
+#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT)
+#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT)
+#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT)
+#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT)
+
+#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \
+ PTLRPC_CTX_UPTODATE | \
+ PTLRPC_CTX_DEAD | \
+ PTLRPC_CTX_ERROR)
+
+struct ptlrpc_cli_ctx {
+ struct hlist_node cc_cache; /* linked into ctx cache */
+ atomic_t cc_refcount;
+ struct ptlrpc_sec *cc_sec;
+ struct ptlrpc_ctx_ops *cc_ops;
+ unsigned long cc_expire; /* in seconds */
+ unsigned int cc_early_expire:1;
+ unsigned long cc_flags;
+ struct vfs_cred cc_vcred;
+ spinlock_t cc_lock;
+ struct list_head cc_req_list; /* waiting reqs linked here */
+ struct list_head cc_gc_chain; /* linked to gc chain */
+};
+
+/**
+ * client side policy operation vector.
+ */
+struct ptlrpc_sec_cops {
+ /**
+ * Given an \a imp, create and initialize a ptlrpc_sec structure.
+ * \param ctx service context:
+ * - regular import: \a ctx should be NULL;
+ * - reverse import: \a ctx is obtained from incoming request.
+ * \param flavor specify what flavor to use.
+ *
+ * When necessary, the policy module is responsible for taking a
+ * reference on the import.
+ *
+ * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+ */
+ struct ptlrpc_sec * (*create_sec) (struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx,
+ struct sptlrpc_flavor *flavor);
+
+ /**
+ * Destructor of ptlrpc_sec. When called, the refcount has been dropped
+ * to 0 and all contexts have been destroyed.
+ *
+ * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr().
+ */
+ void (*destroy_sec) (struct ptlrpc_sec *sec);
+
+ /**
+ * Notify that this ptlrpc_sec is going to die. Optionally, the policy
+ * module is supposed to set sec->ps_dying and take whatever actions
+ * are necessary.
+ *
+ * \see plain_kill_sec(), gss_sec_kill().
+ */
+ void (*kill_sec) (struct ptlrpc_sec *sec);
+
+ /**
+ * Given \a vcred, look up and/or create its context. The policy module
+ * is supposed to maintain its own context cache.
+ * XXX currently \a create and \a remove_dead are always 1; perhaps they
+ * should be removed completely.
+ *
+ * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr().
+ */
+ struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred,
+ int create,
+ int remove_dead);
+
+ /**
+ * Called when the reference count of \a ctx drops to 0. The policy
+ * module is supposed to destroy this context, or do whatever else is
+ * appropriate according to its cache maintenance mechanism.
+ *
+ * \param sync if zero, we shouldn't wait for the context to be
+ * destroyed completely.
+ *
+ * \see plain_release_ctx(), gss_sec_release_ctx_kr().
+ */
+ void (*release_ctx) (struct ptlrpc_sec *sec,
+ struct ptlrpc_cli_ctx *ctx,
+ int sync);
+
+ /**
+ * Flush the context cache.
+ *
+ * \param uid the user whose contexts to flush; -1 means all contexts.
+ * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected
+ * contexts should be cleared immediately.
+ * \param force if zero, only idle contexts will be flushed.
+ *
+ * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+ */
+ int (*flush_ctx_cache)
+ (struct ptlrpc_sec *sec,
+ uid_t uid,
+ int grace,
+ int force);
+
+ /**
+ * Called periodically by garbage collector to remove dead contexts
+ * from cache.
+ *
+ * \see gss_sec_gc_ctx_kr().
+ */
+ void (*gc_ctx) (struct ptlrpc_sec *sec);
+
+ /**
+ * Given a context \a ctx, install a corresponding reverse service
+ * context on the client side.
+ * XXX currently it's only used by the GSS module; maybe we should
+ * remove this from the general API.
+ */
+ int (*install_rctx)(struct obd_import *imp,
+ struct ptlrpc_sec *sec,
+ struct ptlrpc_cli_ctx *ctx);
+
+ /**
+ * To allocate request buffer for \a req.
+ *
+ * \pre req->rq_reqmsg == NULL.
+ * \pre req->rq_reqbuf == NULL; otherwise it must be pre-allocated and
+ * we are not supposed to free it.
+ * \post on success, req->rq_reqmsg points to a buffer of size
+ * at least \a lustre_msg_size.
+ *
+ * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf().
+ */
+ int (*alloc_reqbuf)(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lustre_msg_size);
+
+ /**
+ * To free request buffer for \a req.
+ *
+ * \pre req->rq_reqbuf != NULL.
+ *
+ * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+ */
+ void (*free_reqbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req);
+
+ /**
+ * To allocate reply buffer for \a req.
+ *
+ * \pre req->rq_repbuf == NULL.
+ * \post on success, req->rq_repbuf points to a buffer of size
+ * req->rq_repbuf_len, which should be large enough to receive a reply
+ * transformed from \a lustre_msg_size bytes of clear text.
+ *
+ * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf().
+ */
+ int (*alloc_repbuf)(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lustre_msg_size);
+
+ /**
+ * To free reply buffer for \a req.
+ *
+ * \pre req->rq_repbuf != NULL.
+ * \post req->rq_repbuf == NULL.
+ * \post req->rq_repbuf_len == 0.
+ *
+ * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+ */
+ void (*free_repbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req);
+
+ /**
+ * To expand the request buffer of \a req so that the \a segment in
+ * the request message pointed to by req->rq_reqmsg can accommodate
+ * at least \a newsize bytes of data.
+ *
+ * \pre req->rq_reqmsg->lm_buflens[segment] < newsize.
+ *
+ * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(),
+ * gss_enlarge_reqbuf().
+ */
+ int (*enlarge_reqbuf)
+ (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int segment, int newsize);
+ /*
+ * misc
+ */
+ int (*display) (struct ptlrpc_sec *sec,
+ struct seq_file *seq);
+};
+
+/**
+ * server side policy operation vector.
+ */
+struct ptlrpc_sec_sops {
+ /**
+ * verify an incoming request.
+ *
+ * \pre the request message is pointed to by req->rq_reqbuf, its size is
+ * req->rq_reqdata_len, and the message has been unpacked to
+ * host byte order.
+ *
+ * \retval SECSVC_OK success; req->rq_reqmsg points to the request
+ * message in clear text, its size is req->rq_reqlen; req->rq_svc_ctx is
+ * set; req->rq_sp_from is decoded from the request.
+ * \retval SECSVC_COMPLETE success; the request has been fully
+ * processed and the reply message has been prepared; req->rq_sp_from is
+ * decoded from the request.
+ * \retval SECSVC_DROP failure; this request should be dropped.
+ *
+ * \see null_accept(), plain_accept(), gss_svc_accept_kr().
+ */
+ int (*accept) (struct ptlrpc_request *req);
+
+ /**
+ * Perform security transformation upon reply message.
+ *
+ * \pre the reply message is pointed to by req->rq_reply_state->rs_msg
+ * and its size is req->rq_replen.
+ * \post req->rs_repdata_len is the final message size.
+ * \post req->rq_reply_off is set.
+ *
+ * \see null_authorize(), plain_authorize(), gss_svc_authorize().
+ */
+ int (*authorize) (struct ptlrpc_request *req);
+
+ /**
+ * Invalidate server context \a ctx.
+ *
+ * \see gss_svc_invalidate_ctx().
+ */
+ void (*invalidate_ctx)
+ (struct ptlrpc_svc_ctx *ctx);
+
+ /**
+ * Allocate a ptlrpc_reply_state.
+ *
+ * \param msgsize size of the reply message in clear text.
+ * \pre if req->rq_reply_state != NULL, then it is pre-allocated and we
+ * should simply use it; otherwise we are responsible for allocating
+ * a new one.
+ * \post req->rq_reply_state != NULL;
+ * \post req->rq_reply_state->rs_msg != NULL;
+ *
+ * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs().
+ */
+ int (*alloc_rs) (struct ptlrpc_request *req,
+ int msgsize);
+
+ /**
+ * Free a ptlrpc_reply_state.
+ */
+ void (*free_rs) (struct ptlrpc_reply_state *rs);
+
+ /**
+ * Release the server context \a ctx.
+ *
+ * \see gss_svc_free_ctx().
+ */
+ void (*free_ctx) (struct ptlrpc_svc_ctx *ctx);
+
+ /**
+ * Install a reverse context based on the server context \a ctx.
+ *
+ * \see gss_svc_install_rctx_kr().
+ */
+ int (*install_rctx)(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx);
+
+ /**
+ * Prepare buffer for incoming bulk write.
+ *
+ * \pre desc->bd_iov and desc->bd_iov_count describe the buffer
+ * intended to receive the write.
+ *
+ * \see gss_svc_prep_bulk().
+ */
+ int (*prep_bulk) (struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+ /**
+ * Unwrap the bulk write data.
+ *
+ * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk().
+ */
+ int (*unwrap_bulk) (struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+ /**
+ * Wrap the bulk read data.
+ *
+ * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk().
+ */
+ int (*wrap_bulk) (struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+};
+
+struct ptlrpc_sec_policy {
+ struct module *sp_owner;
+ char *sp_name;
+ __u16 sp_policy; /* policy number */
+ struct ptlrpc_sec_cops *sp_cops; /* client ops */
+ struct ptlrpc_sec_sops *sp_sops; /* server ops */
+};
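+
+/*
+ * Sketch (illustrative, not from the original header): a policy module is
+ * expected to fill this descriptor and hand it to sptlrpc_register_policy()
+ * (declared later in this file) from its module init. "my_cops" and
+ * "my_sops" are hypothetical client/server operation vectors.
+ *
+ *	static struct ptlrpc_sec_policy my_policy = {
+ *		.sp_owner	= THIS_MODULE,
+ *		.sp_name	= "plain",
+ *		.sp_policy	= SPTLRPC_POLICY_PLAIN,
+ *		.sp_cops	= &my_cops,
+ *		.sp_sops	= &my_sops,
+ *	};
+ *
+ *	rc = sptlrpc_register_policy(&my_policy);
+ */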
+
+#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */
+#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */
+#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */
+#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */
+#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */
+
+/**
+ * The ptlrpc_sec represents the client-side ptlrpc security facilities;
+ * each obd_import (both regular and reverse) must be associated with
+ * a ptlrpc_sec.
+ *
+ * \see sptlrpc_import_sec_adapt().
+ */
+struct ptlrpc_sec {
+ struct ptlrpc_sec_policy *ps_policy;
+ atomic_t ps_refcount;
+ /** statistic only */
+ atomic_t ps_nctx;
+ /** unique identifier */
+ int ps_id;
+ struct sptlrpc_flavor ps_flvr;
+ enum lustre_sec_part ps_part;
+ /** after set, no more new context will be created */
+ unsigned int ps_dying:1;
+ /** owning import */
+ struct obd_import *ps_import;
+ spinlock_t ps_lock;
+
+ /*
+ * garbage collection
+ */
+ struct list_head ps_gc_list;
+ unsigned long ps_gc_interval; /* in seconds */
+ unsigned long ps_gc_next; /* in seconds */
+};
+
+static inline int sec_is_reverse(struct ptlrpc_sec *sec)
+{
+ return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE);
+}
+
+static inline int sec_is_rootonly(struct ptlrpc_sec *sec)
+{
+ return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY);
+}
+
+
+struct ptlrpc_svc_ctx {
+ atomic_t sc_refcount;
+ struct ptlrpc_sec_policy *sc_policy;
+};
+
+/*
+ * user identity descriptor
+ */
+#define LUSTRE_MAX_GROUPS (128)
+
+struct ptlrpc_user_desc {
+ __u32 pud_uid;
+ __u32 pud_gid;
+ __u32 pud_fsuid;
+ __u32 pud_fsgid;
+ __u32 pud_cap;
+ __u32 pud_ngroups;
+ __u32 pud_groups[0];
+};
+
+/*
+ * bulk flavors
+ */
+enum sptlrpc_bulk_hash_alg {
+ BULK_HASH_ALG_NULL = 0,
+ BULK_HASH_ALG_ADLER32,
+ BULK_HASH_ALG_CRC32,
+ BULK_HASH_ALG_MD5,
+ BULK_HASH_ALG_SHA1,
+ BULK_HASH_ALG_SHA256,
+ BULK_HASH_ALG_SHA384,
+ BULK_HASH_ALG_SHA512,
+ BULK_HASH_ALG_MAX
+};
+
+const char *sptlrpc_get_hash_name(__u8 hash_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
+
+enum {
+ BSD_FL_ERR = 1,
+};
+
+struct ptlrpc_bulk_sec_desc {
+ __u8 bsd_version; /* 0 */
+ __u8 bsd_type; /* SPTLRPC_BULK_XXX */
+ __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */
+ __u8 bsd_flags; /* flags */
+ __u32 bsd_nob; /* nob of bulk data */
+ __u8 bsd_data[0]; /* policy-specific token */
+};
+
+
+/*
+ * round size up to the next power of 2, for slab allocation.
+ * @size must be sane (must not overflow after rounding up)
+ */
+static inline int size_roundup_power2(int size)
+{
+ size--;
+ size |= size >> 1;
+ size |= size >> 2;
+ size |= size >> 4;
+ size |= size >> 8;
+ size |= size >> 16;
+ size++;
+ return size;
+}
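+
+/*
+ * For example, size_roundup_power2(1) == 1, size_roundup_power2(1000) == 1024
+ * and size_roundup_power2(4096) == 4096 (powers of two are returned
+ * unchanged).
+ */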
+
+/*
+ * internal support libraries
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+ int segment, int newsize);
+
+/*
+ * security policies
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
+
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+ char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
+
+static inline
+struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
+{
+ __module_get(policy->sp_owner);
+ return policy;
+}
+
+static inline
+void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy)
+{
+ module_put(policy->sp_owner);
+}
+
+/*
+ * client credential
+ */
+static inline
+unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx)
+{
+ return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK);
+}
+
+static inline
+int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx)
+{
+ return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE);
+}
+
+static inline
+int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx)
+{
+ return (cli_ctx_status(ctx) != 0);
+}
+
+static inline
+int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0);
+}
+
+static inline
+int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0);
+}
+
+static inline
+int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0);
+}
+
+static inline
+int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx)
+{
+ return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0);
+}
+
+/*
+ * sec get/put
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec);
+void sptlrpc_sec_put(struct ptlrpc_sec *sec);
+
+/*
+ * internal APIs which are only used by policy implementations
+ */
+int sptlrpc_get_next_secid(void);
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec);
+
+/*
+ * exported client context api
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync);
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx);
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+
+/*
+ * exported client context wrap/buffers
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req);
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+ int segment, int newsize);
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+ struct ptlrpc_request **req_ret);
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req);
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req);
+
+/*
+ * exported higher-level interface for imports & requests
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx,
+ struct sptlrpc_flavor *flvr);
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
+void sptlrpc_import_sec_put(struct obd_import *imp);
+
+int sptlrpc_import_check_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp);
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync);
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout);
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode);
+
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule);
+
+/* gc */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx);
+
+/* misc */
+const char *sec2target_str(struct ptlrpc_sec *sec);
+/*
+ * lprocfs
+ */
+#if defined (CONFIG_PROC_FS)
+struct proc_dir_entry;
+extern struct proc_dir_entry *sptlrpc_proc_root;
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev);
+#else
+#define sptlrpc_proc_root NULL
+static inline int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev)
+{ return 0; }
+#endif
+
+/*
+ * server side
+ */
+enum secsvc_accept_res {
+ SECSVC_OK = 0,
+ SECSVC_COMPLETE,
+ SECSVC_DROP,
+};
+
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req);
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req);
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs);
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req);
+
+int sptlrpc_target_export_check(struct obd_export *exp,
+ struct ptlrpc_request *req);
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+ struct sptlrpc_rule_set *rset);
+
+/*
+ * reverse context
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx);
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+ struct ptlrpc_cli_ctx *ctx);
+
+/* bulk security api */
+int sptlrpc_enc_pool_add_user(void);
+int sptlrpc_enc_pool_del_user(void);
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc,
+ int nob);
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc);
+
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+ void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed);
+
+/* user descriptor helpers */
+static inline int sptlrpc_user_desc_size(int ngroups)
+{
+ return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32);
+}
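+
+/*
+ * For example, with the six __u32 fields above, sptlrpc_user_desc_size(0) is
+ * 24 bytes and sptlrpc_user_desc_size(LUSTRE_MAX_GROUPS) is
+ * 24 + 128 * 4 = 536 bytes.
+ */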
+
+int sptlrpc_current_user_desc_size(void);
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
+int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed);
+
+
+#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
+#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
+
+enum {
+ LUSTRE_SEC_NONE = 0,
+ LUSTRE_SEC_REMOTE = 1,
+ LUSTRE_SEC_SPECIFY = 2,
+ LUSTRE_SEC_ALL = 3
+};
+
+/** @} sptlrpc */
+
+#endif /* _LUSTRE_SEC_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h
new file mode 100644
index 000000000..caa4da12f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/lustre_ver.h
@@ -0,0 +1,26 @@
+#ifndef _LUSTRE_VER_H_
+#define _LUSTRE_VER_H_
+/* This file is automatically generated from lustre/include/lustre_ver.h.in,
+ * based on parameters in lustre/autoconf/lustre-version.ac.
+ * Changes made directly to this file will be lost. */
+
+#define LUSTRE_MAJOR 2
+#define LUSTRE_MINOR 3
+#define LUSTRE_PATCH 64
+#define LUSTRE_FIX 0
+#define LUSTRE_VERSION_STRING "2.3.64"
+
+#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR, \
+ LUSTRE_MINOR, LUSTRE_PATCH, \
+ LUSTRE_FIX)
+
+/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches
+ * by no more than this amount (set in lustre/autoconf/lustre-version.ac). */
+#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32)
+
+/* If the Lustre version of the client and the servers it connects to differs
+ * by more than this amount, the client will issue a warning.
+ * (set in lustre/autoconf/lustre-version.ac) */
+#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0)
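+
+/* Illustrative note: assuming OBD_OCD_VERSION() packs one byte per component,
+ * as defined elsewhere (lustre_idl.h), LUSTRE_VERSION_CODE above evaluates to
+ * (2 << 24) | (3 << 16) | (64 << 8) | 0 = 0x02034000, and the warning
+ * threshold LUSTRE_VERSION_OFFSET_WARN corresponds to a difference of four in
+ * the minor-version slot. */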
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
new file mode 100644
index 000000000..2a88b806f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -0,0 +1,1496 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_H
+#define __OBD_H
+
+#include "linux/obd.h"
+
+#define IOC_OSC_TYPE 'h'
+#define IOC_OSC_MIN_NR 20
+#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
+#define IOC_OSC_MAX_NR 50
+
+#define IOC_MDC_TYPE 'i'
+#define IOC_MDC_MIN_NR 20
+#define IOC_MDC_MAX_NR 50
+
+#include "lustre/lustre_idl.h"
+#include "lustre_lib.h"
+#include "lu_ref.h"
+#include "lustre_export.h"
+#include "lustre_fid.h"
+#include "lustre_fld.h"
+#include "lustre_capa.h"
+
+#define MAX_OBD_DEVICES 8192
+
+struct osc_async_rc {
+ int ar_rc;
+ int ar_force_sync;
+ __u64 ar_min_xid;
+};
+
+struct lov_oinfo { /* per-stripe data structure */
+ struct ost_id loi_oi; /* object ID/Sequence on the target OST */
+ int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */
+ int loi_ost_gen; /* generation of this loi_ost_idx */
+
+ unsigned long loi_kms_valid:1;
+ __u64 loi_kms; /* known minimum size */
+ struct ost_lvb loi_lvb;
+ struct osc_async_rc loi_ar;
+};
+
+static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms)
+{
+ oinfo->loi_kms = kms;
+ oinfo->loi_kms_valid = 1;
+}
+
+static inline void loi_init(struct lov_oinfo *loi)
+{
+}
+
+struct lov_stripe_md {
+ atomic_t lsm_refc;
+ spinlock_t lsm_lock;
+ pid_t lsm_lock_owner; /* debugging */
+
+ /* maximum possible file size; might change as OST status changes,
+ * e.g. disconnected, deactivated */
+ __u64 lsm_maxbytes;
+ struct {
+ /* Public members. */
+ struct ost_id lw_object_oi; /* lov object id/seq */
+
+ /* LOV-private members start here -- only for use in lov/. */
+ __u32 lw_magic;
+ __u32 lw_stripe_size; /* size of the stripe */
+ __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */
+ __u16 lw_stripe_count; /* number of objects being striped over */
+ __u16 lw_layout_gen; /* generation of the layout */
+ char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+ } lsm_wire;
+
+ struct lov_oinfo *lsm_oinfo[0];
+};
+
+#define lsm_oi lsm_wire.lw_object_oi
+#define lsm_magic lsm_wire.lw_magic
+#define lsm_layout_gen lsm_wire.lw_layout_gen
+#define lsm_stripe_size lsm_wire.lw_stripe_size
+#define lsm_pattern lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name lsm_wire.lw_pool_name
+
+static inline bool lsm_is_released(struct lov_stripe_md *lsm)
+{
+ return !!(lsm->lsm_pattern & LOV_PATTERN_F_RELEASED);
+}
+
+static inline bool lsm_has_objects(struct lov_stripe_md *lsm)
+{
+ if (lsm == NULL)
+ return false;
+ if (lsm_is_released(lsm))
+ return false;
+ return true;
+}
+
+static inline int lov_stripe_md_size(unsigned int stripe_count)
+{
+ struct lov_stripe_md lsm;
+
+ return sizeof(lsm) + stripe_count * sizeof(lsm.lsm_oinfo[0]);
+}
+
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+ /* Lock policy. It keeps an extent which is specific to a particular
+ * OSC (e.g. lov_prep_enqueue_set initialises the extent of the policy,
+ * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue). */
+ ldlm_policy_data_t oi_policy;
+ /* Flags used to set request-specific flags:
+ - during lock handling, the flags obtained on the enqueue
+ request are set here.
+ - during stats, the flags used to control delay/resend.
+ - during setattr, the flags used to distinguish the punch operation.
+ */
+ __u64 oi_flags;
+ /* Lock handle specific for every OSC lock. */
+ struct lustre_handle *oi_lockh;
+ /* lsm data specific for every OSC. */
+ struct lov_stripe_md *oi_md;
+ /* obdo data specific for every OSC, if needed at all. */
+ struct obdo *oi_oa;
+ /* statfs data specific for every OSC, if needed at all. */
+ struct obd_statfs *oi_osfs;
+ /* An update callback which is called to update some data on an upper
+ * level. E.g. it is used to update lsm->lsm_oinfo for every received
+ * request at the OSC level for enqueue requests. It is also possible to
+ * update some caller data from the LOV layer if needed. */
+ obd_enqueue_update_f oi_cb_up;
+ /* OSS capability; its type is obd_capa on the client to avoid a copy,
+ * whereas it is lustre_capa on the OSS. */
+ void *oi_capa;
+ /* transfer jobid from ost_sync() to filter_sync()... */
+ char *oi_jobid;
+};
+
+/* compare all relevant fields. */
+static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
+ struct lov_stripe_md *m2)
+{
+ /*
+ * ->lsm_wire contains padding, but it should be zeroed out during
+ * allocation.
+ */
+ return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof(m1->lsm_wire));
+}
+
+static inline int lov_lum_lsm_cmp(struct lov_user_md *lum,
+ struct lov_stripe_md *lsm)
+{
+ if (lsm->lsm_magic != lum->lmm_magic)
+ return 1;
+ if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) &&
+ (lsm->lsm_stripe_count != lum->lmm_stripe_count))
+ return 2;
+ if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) &&
+ (lsm->lsm_stripe_size != lum->lmm_stripe_size))
+ return 3;
+ if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) &&
+ (lsm->lsm_pattern != lum->lmm_pattern))
+ return 4;
+ if ((lsm->lsm_magic == LOV_MAGIC_V3) &&
+ (strncmp(lsm->lsm_pool_name,
+ ((struct lov_user_md_v3 *)lum)->lmm_pool_name,
+ LOV_MAXPOOLNAME) != 0))
+ return 5;
+ return 0;
+}
+
+static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3,
+ int *lmm_magic,
+ struct lov_user_md *lum)
+{
+ if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1)))
+ return -EFAULT;
+
+ *lmm_magic = lumv3->lmm_magic;
+
+ if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+ lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3);
+ *lmm_magic = LOV_USER_MAGIC_V1;
+ } else if (*lmm_magic == LOV_USER_MAGIC_V3) {
+ if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+ return -EFAULT;
+ } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+ if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+ return -EFAULT;
+ lustre_swab_lov_user_md_v3(lumv3);
+ *lmm_magic = LOV_USER_MAGIC_V3;
+ } else if (*lmm_magic != LOV_USER_MAGIC_V1) {
+ CDEBUG(D_IOCTL,
+ "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+ *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void lov_stripe_lock(struct lov_stripe_md *md);
+void lov_stripe_unlock(struct lov_stripe_md *md);
+
+struct obd_type {
+ struct list_head typ_chain;
+ struct obd_ops *typ_dt_ops;
+ struct md_ops *typ_md_ops;
+ struct proc_dir_entry *typ_procroot;
+ char *typ_name;
+ int typ_refcnt;
+ struct lu_device_type *typ_lu;
+ spinlock_t obd_type_lock;
+};
+
+struct brw_page {
+ u64 off;
+ struct page *pg;
+ int count;
+ u32 flag;
+};
+
+/* llog contexts */
+enum llog_ctxt_id {
+ LLOG_CONFIG_ORIG_CTXT = 0,
+ LLOG_CONFIG_REPL_CTXT,
+ LLOG_MDS_OST_ORIG_CTXT,
+ LLOG_MDS_OST_REPL_CTXT,
+ LLOG_SIZE_ORIG_CTXT,
+ LLOG_SIZE_REPL_CTXT,
+ LLOG_RD1_ORIG_CTXT,
+ LLOG_RD1_REPL_CTXT,
+ LLOG_TEST_ORIG_CTXT,
+ LLOG_TEST_REPL_CTXT,
+ LLOG_LOVEA_ORIG_CTXT,
+ LLOG_LOVEA_REPL_CTXT,
+ LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */
+ LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */
+ LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */
+ LLOG_AGENT_ORIG_CTXT, /**< agent requests generation on cdt */
+ LLOG_MAX_CTXTS
+};
+
+struct timeout_item {
+ enum timeout_event ti_event;
+ unsigned long ti_timeout;
+ timeout_cb_t ti_cb;
+ void *ti_cb_data;
+ struct list_head ti_obd_list;
+ struct list_head ti_chain;
+};
+
+#define OSC_MAX_RIF_DEFAULT 8
+#define MDS_OSC_MAX_RIF_DEFAULT 50
+#define OSC_MAX_RIF_MAX 256
+#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */
+#define OSC_DEFAULT_RESENDS 10
+
+/* possible values for fo_sync_lock_cancel */
+enum {
+ NEVER_SYNC_ON_CANCEL = 0,
+ BLOCKING_SYNC_ON_CANCEL = 1,
+ ALWAYS_SYNC_ON_CANCEL = 2,
+ NUM_SYNC_ON_CANCEL_STATES
+};
+
+#define MDC_MAX_RIF_DEFAULT 8
+#define MDC_MAX_RIF_MAX 512
+
+struct mdc_rpc_lock;
+struct obd_import;
+struct client_obd {
+ struct rw_semaphore cl_sem;
+ struct obd_uuid cl_target_uuid;
+ struct obd_import *cl_import; /* ptlrpc connection state */
+ int cl_conn_count;
+ /* max_mds_easize is purely a performance thing so we don't have to
+ * call obd_size_diskmd() all the time. */
+ int cl_default_mds_easize;
+ int cl_max_mds_easize;
+ int cl_default_mds_cookiesize;
+ int cl_max_mds_cookiesize;
+
+ enum lustre_sec_part cl_sp_me;
+ enum lustre_sec_part cl_sp_to;
+ struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */
+
+ /* the grant values are protected by loi_list_lock below */
+ long cl_dirty; /* all _dirty_ in bytes */
+ long cl_dirty_max; /* allowed w/o rpc */
+ long cl_dirty_transit; /* dirty synchronous */
+ long cl_avail_grant; /* bytes of credit for ost */
+ long cl_lost_grant; /* lost credits (trunc) */
+
+ /* Since we allocate grant by blocks, we don't know how much grant will
+ * be used to add a page into the cache. As a solution, we reserve the
+ * maximum grant before trying to dirty a page and unreserve the rest.
+ * See osc_{reserve|unreserve}_grant for details. */
+ long cl_reserved_grant;
+ struct list_head cl_cache_waiters; /* waiting for cache/grant */
+ unsigned long cl_next_shrink_grant; /* jiffies */
+ struct list_head cl_grant_shrink_list; /* Timeout event list */
+ int cl_grant_shrink_interval; /* seconds */
+
+ /* A chunk is an optimal size used by osc_extent to determine
+ * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */
+ int cl_chunkbits;
+ int cl_chunk;
+ int cl_extent_tax; /* extent overhead, by bytes */
+
+ /* keep track of objects that have lois that contain pages which
+ * have been queued for async brw. this lock also protects the
+ * lists of osc_client_pages that hang off of the loi */
+ /*
+ * ->cl_loi_list_lock protects consistency of
+ * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and
+ * ->ap_completion() call-backs are executed under this lock. As we
+ * cannot guarantee that these call-backs never block on all platforms
+ * (as a matter of fact they do block on Mac OS X), type of
+ * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux
+ * and blocking mutex on Mac OS X. (Alternative is to make this lock
+ * blocking everywhere, but we don't want to slow down fast-path of
+ * our main platform.)
+ *
+ * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together
+ * with client_obd_list_{un,}lock() and
+ * client_obd_list_lock_{init,done}() functions.
+ *
+ * NB by Jinshan: although the field names are still _loi_, it is
+ * actually osc_object{}s that are in the list.
+ */
+ client_obd_lock_t cl_loi_list_lock;
+ struct list_head cl_loi_ready_list;
+ struct list_head cl_loi_hp_ready_list;
+ struct list_head cl_loi_write_list;
+ struct list_head cl_loi_read_list;
+ int cl_r_in_flight;
+ int cl_w_in_flight;
+ /* just a sum of the loi/lop pending numbers to be exported by /proc */
+ atomic_t cl_pending_w_pages;
+ atomic_t cl_pending_r_pages;
+ __u32 cl_max_pages_per_rpc;
+ int cl_max_rpcs_in_flight;
+ struct obd_histogram cl_read_rpc_hist;
+ struct obd_histogram cl_write_rpc_hist;
+ struct obd_histogram cl_read_page_hist;
+ struct obd_histogram cl_write_page_hist;
+ struct obd_histogram cl_read_offset_hist;
+ struct obd_histogram cl_write_offset_hist;
+
+ /* lru for osc caching pages */
+ struct cl_client_cache *cl_cache;
+ struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */
+ atomic_t *cl_lru_left;
+ atomic_t cl_lru_busy;
+ atomic_t cl_lru_shrinkers;
+ atomic_t cl_lru_in_list;
+ struct list_head cl_lru_list; /* lru page list */
+ client_obd_lock_t cl_lru_list_lock; /* page list protector */
+
+ /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
+ atomic_t cl_destroy_in_flight;
+ wait_queue_head_t cl_destroy_waitq;
+
+ struct mdc_rpc_lock *cl_rpc_lock;
+ struct mdc_rpc_lock *cl_close_lock;
+
+ /* mgc datastruct */
+ struct mutex cl_mgc_mutex;
+ struct local_oid_storage *cl_mgc_los;
+ struct dt_object *cl_mgc_configs_dir;
+ atomic_t cl_mgc_refcount;
+ struct obd_export *cl_mgc_mgsexp;
+
+ /* checksumming for data sent over the network */
+ unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */
+ /* supported checksum types that are worked out at connect time */
+ __u32 cl_supp_cksum_types;
+ /* checksum algorithm to be used */
+ cksum_type_t cl_cksum_type;
+
+ /* also protected by the poorly named _loi_list_lock lock above */
+ struct osc_async_rc cl_ar;
+
+ /* used by quotacheck when the servers are older than 2.4 */
+ int cl_qchk_stat; /* quotacheck stat of the peer */
+#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0)
+#warning "please consider removing quotacheck compatibility code"
+#endif
+
+ /* sequence manager */
+ struct lu_client_seq *cl_seq;
+
+ atomic_t cl_resends; /* resend count */
+
+ /* ptlrpc work for writeback in ptlrpcd context */
+ void *cl_writeback_work;
+ /* hash tables for osc_quota_info */
+ struct cfs_hash *cl_quota_hash[MAXQUOTAS];
+};
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
+
+struct obd_id_info {
+ __u32 idx;
+ u64 *data;
+};
+
+struct echo_client_obd {
+ struct obd_export *ec_exp; /* the local connection to osc/lov */
+ spinlock_t ec_lock;
+ struct list_head ec_objects;
+ struct list_head ec_locks;
+ int ec_nstripes;
+ __u64 ec_unique;
+};
+
+struct lov_qos_oss {
+ struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */
+ struct list_head lqo_oss_list; /* link to lov_qos */
+ __u64 lqo_bavail; /* total bytes avail on OSS */
+ __u64 lqo_penalty; /* current penalty */
+ __u64 lqo_penalty_per_obj;/* penalty decrease every obj*/
+ time_t lqo_used; /* last used time, seconds */
+ __u32 lqo_ost_count; /* number of osts on this oss */
+};
+
+struct ltd_qos {
+ struct lov_qos_oss *ltq_oss; /* oss info */
+ __u64 ltq_penalty; /* current penalty */
+ __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/
+ __u64 ltq_weight; /* net weighting */
+ time_t ltq_used; /* last used time, seconds */
+ unsigned int ltq_usable:1; /* usable for striping */
+};
+
+/* Generic subset of OSTs */
+struct ost_pool {
+ __u32 *op_array; /* array of indices into
+ lov_obd->lov_tgts */
+ unsigned int op_count; /* number of OSTs in the array */
+ unsigned int op_size; /* allocated size of op_array */
+ struct rw_semaphore op_rw_sem; /* to protect ost_pool use */
+};
+
+/* Round-robin allocator data */
+struct lov_qos_rr {
+ __u32 lqr_start_idx; /* start index of new inode */
+ __u32 lqr_offset_idx; /* aliasing for start_idx */
+ int lqr_start_count; /* reseed counter */
+ struct ost_pool lqr_pool; /* round-robin optimized list */
+ unsigned long lqr_dirty:1; /* recalc round-robin list */
+};
+
+/* allow statfs data caching for 1 second */
+#define OBD_STATFS_CACHE_SECONDS 1
+
+struct lov_statfs_data {
+ struct obd_info lsd_oi;
+ struct obd_statfs lsd_statfs;
+};
+/* Stripe placement optimization */
+struct lov_qos {
+ struct list_head lq_oss_list; /* list of OSSs that targets use */
+ struct rw_semaphore lq_rw_sem;
+ __u32 lq_active_oss_count;
+ unsigned int lq_prio_free; /* priority for free space */
+ unsigned int lq_threshold_rr;/* priority for rr */
+ struct lov_qos_rr lq_rr; /* round robin qos data */
+ unsigned long lq_dirty:1, /* recalc qos data */
+ lq_same_space:1,/* the OSTs all have approx.
+ the same space avail */
+ lq_reset:1, /* zero current penalties */
+ lq_statfs_in_progress:1; /* statfs op in
+ progress */
+ /* qos statfs data */
+ struct lov_statfs_data *lq_statfs_data;
+ wait_queue_head_t lq_statfs_waitq; /* waitqueue to notify statfs
+ * requests completion */
+};
+
+struct lov_tgt_desc {
+ struct list_head ltd_kill;
+ struct obd_uuid ltd_uuid;
+ struct obd_device *ltd_obd;
+ struct obd_export *ltd_exp;
+ struct ltd_qos ltd_qos; /* qos info per target */
+ __u32 ltd_gen;
+ __u32 ltd_index; /* index in lov_obd->tgts */
+ unsigned long ltd_active:1,/* is this target up for requests */
+ ltd_activate:1,/* should target be activated */
+ ltd_reap:1; /* should this target be deleted */
+};
+
+/* Pool metadata */
+#define pool_tgt_size(_p) _p->pool_obds.op_size
+#define pool_tgt_count(_p) _p->pool_obds.op_count
+#define pool_tgt_array(_p) _p->pool_obds.op_array
+#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem
+
+struct pool_desc {
+ char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+ struct ost_pool pool_obds; /* pool members */
+ atomic_t pool_refcount; /* pool ref. counter */
+ struct lov_qos_rr pool_rr; /* round robin qos */
+ struct hlist_node pool_hash; /* access by poolname */
+ struct list_head pool_list; /* serial access */
+ struct proc_dir_entry *pool_proc_entry; /* file in /proc */
+ struct obd_device *pool_lobd; /* obd of the lov/lod to which
+ * this pool belongs */
+};
+
+struct lov_obd {
+ struct lov_desc desc;
+ struct lov_tgt_desc **lov_tgts; /* sparse array */
+ struct ost_pool lov_packed; /* all OSTs in a packed
+ array */
+ struct mutex lov_lock;
+ struct obd_connect_data lov_ocd;
+ atomic_t lov_refcount;
+ __u32 lov_tgt_count; /* how many OBD's */
+ __u32 lov_active_tgt_count; /* how many active */
+ __u32 lov_death_row;/* tgts scheduled to be deleted */
+ __u32 lov_tgt_size; /* size of tgts array */
+ int lov_connects;
+ int lov_pool_count;
+ struct cfs_hash *lov_pools_hash_body; /* used for key access */
+ struct list_head lov_pool_list; /* used for sequential access */
+ struct proc_dir_entry *lov_pool_proc_entry;
+ enum lustre_sec_part lov_sp_me;
+
+ /* Cached LRU pages from upper layer */
+ void *lov_cache;
+
+ struct rw_semaphore lov_notify_lock;
+};
+
+struct lmv_tgt_desc {
+ struct obd_uuid ltd_uuid;
+ struct obd_export *ltd_exp;
+ int ltd_idx;
+ struct mutex ltd_fid_mutex;
+ unsigned long ltd_active:1; /* target up for requests */
+};
+
+enum placement_policy {
+ PLACEMENT_CHAR_POLICY = 0,
+ PLACEMENT_NID_POLICY = 1,
+ PLACEMENT_INVAL_POLICY = 2,
+ PLACEMENT_MAX_POLICY
+};
+
+struct lmv_obd {
+ int refcount;
+ struct lu_client_fld lmv_fld;
+ spinlock_t lmv_lock;
+ enum placement_policy lmv_placement;
+ struct lmv_desc desc;
+ struct obd_uuid cluuid;
+ struct obd_export *exp;
+
+ struct mutex init_mutex;
+ int connected;
+ int max_easize;
+ int max_def_easize;
+ int max_cookiesize;
+ int max_def_cookiesize;
+ int server_timeout;
+
+ int tgts_size; /* size of tgts array */
+ struct lmv_tgt_desc **tgts;
+
+ struct obd_connect_data conn_data;
+};
+
+struct niobuf_local {
+ __u64 lnb_file_offset;
+ __u32 lnb_page_offset;
+ __u32 len;
+ __u32 flags;
+ struct page *page;
+ struct dentry *dentry;
+ int lnb_grant_used;
+ int rc;
+};
+
+#define LUSTRE_FLD_NAME "fld"
+#define LUSTRE_SEQ_NAME "seq"
+
+#define LUSTRE_MDD_NAME "mdd"
+#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs"
+#define LUSTRE_OSD_ZFS_NAME "osd-zfs"
+#define LUSTRE_VVP_NAME "vvp"
+#define LUSTRE_LMV_NAME "lmv"
+#define LUSTRE_SLP_NAME "slp"
+#define LUSTRE_LOD_NAME "lod"
+#define LUSTRE_OSP_NAME "osp"
+#define LUSTRE_LWP_NAME "lwp"
+
+/* obd device type names */
+ /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
+#define LUSTRE_MDS_NAME "mds"
+#define LUSTRE_MDT_NAME "mdt"
+#define LUSTRE_MDC_NAME "mdc"
+#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */
+#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */
+#define LUSTRE_OSC_NAME "osc"
+#define LUSTRE_LOV_NAME "lov"
+#define LUSTRE_MGS_NAME "mgs"
+#define LUSTRE_MGC_NAME "mgc"
+
+#define LUSTRE_ECHO_NAME "obdecho"
+#define LUSTRE_ECHO_CLIENT_NAME "echo_client"
+#define LUSTRE_QMT_NAME "qmt"
+
+/* Constant obd names (post-rename) */
+#define LUSTRE_MDS_OBDNAME "MDS"
+#define LUSTRE_OSS_OBDNAME "OSS"
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
+
+/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
+struct obd_trans_info {
+ __u64 oti_transno;
+ __u64 oti_xid;
+ /* Only used on the server side for tracking acks. */
+ struct oti_req_ack_lock {
+ struct lustre_handle lock;
+ __u32 mode;
+ } oti_ack_locks[4];
+ void *oti_handle;
+ struct llog_cookie oti_onecookie;
+ struct llog_cookie *oti_logcookies;
+ int oti_numcookies;
+ /** synchronous write is needed */
+ unsigned long oti_sync_write:1;
+
+ /* initial thread handling transaction */
+ struct ptlrpc_thread *oti_thread;
+ __u32 oti_conn_cnt;
+ /** VBR: versions */
+ __u64 oti_pre_version;
+ /** JobID */
+ char *oti_jobid;
+
+ struct obd_uuid *oti_ost_uuid;
+};
+
+static inline void oti_init(struct obd_trans_info *oti,
+ struct ptlrpc_request *req)
+{
+ if (oti == NULL)
+ return;
+ memset(oti, 0, sizeof(*oti));
+
+ if (req == NULL)
+ return;
+
+ oti->oti_xid = req->rq_xid;
+ /** VBR: take versions from request */
+ if (req->rq_reqmsg != NULL &&
+ lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+ __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg);
+
+ oti->oti_pre_version = pre_version ? pre_version[0] : 0;
+ oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+ }
+
+ /** called from mds_create_objects */
+ if (req->rq_repmsg != NULL)
+ oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+ oti->oti_thread = req->rq_svc_thread;
+ if (req->rq_reqmsg != NULL)
+ oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+}
+
+static inline void oti_alloc_cookies(struct obd_trans_info *oti,
+ int num_cookies)
+{
+ if (!oti)
+ return;
+
+ if (num_cookies == 1)
+ oti->oti_logcookies = &oti->oti_onecookie;
+ else
+ OBD_ALLOC_LARGE(oti->oti_logcookies,
+ num_cookies * sizeof(oti->oti_onecookie));
+
+ oti->oti_numcookies = num_cookies;
+}
+
+static inline void oti_free_cookies(struct obd_trans_info *oti)
+{
+ if (!oti || !oti->oti_logcookies)
+ return;
+
+ if (oti->oti_logcookies == &oti->oti_onecookie)
+ LASSERT(oti->oti_numcookies == 1);
+ else
+ OBD_FREE_LARGE(oti->oti_logcookies,
+ oti->oti_numcookies*sizeof(oti->oti_onecookie));
+ oti->oti_logcookies = NULL;
+ oti->oti_numcookies = 0;
+}
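+
+/*
+ * Illustrative sketch only: the usual pairing of the two cookie helpers
+ * above. A caller reserves @nr llog cookies on a transaction and frees them
+ * again when the transaction info is torn down. The function name is
+ * hypothetical and filling the cookies from the actual llog records is
+ * elided.
+ */
+static inline void oti_cookies_sketch(struct obd_trans_info *oti, int nr)
+{
+ oti_alloc_cookies(oti, nr);
+ if (oti->oti_logcookies == NULL)
+ return; /* allocation failed (only possible when nr > 1) */
+
+ /* ... record the llog cookies for each of the nr objects here ... */
+
+ oti_free_cookies(oti);
+}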
+
+/*
+ * Events signalled through obd_notify() upcall-chain.
+ */
+enum obd_notify_event {
+ /* target added */
+ OBD_NOTIFY_CREATE,
+ /* Device connect start */
+ OBD_NOTIFY_CONNECT,
+ /* Device activated */
+ OBD_NOTIFY_ACTIVE,
+ /* Device deactivated */
+ OBD_NOTIFY_INACTIVE,
+ /* Device disconnected */
+ OBD_NOTIFY_DISCON,
+ /* Connect data for import were changed */
+ OBD_NOTIFY_OCD,
+ /* Sync request */
+ OBD_NOTIFY_SYNC_NONBLOCK,
+ OBD_NOTIFY_SYNC,
+ /* Configuration event */
+ OBD_NOTIFY_CONFIG,
+ /* Administratively deactivate/activate event */
+ OBD_NOTIFY_DEACTIVATE,
+ OBD_NOTIFY_ACTIVATE
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).
+ */
+struct obd_notify_upcall {
+ int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+ enum obd_notify_event ev, void *owner, void *data);
+ /* Opaque datum supplied by upper layer listener */
+ void *onu_owner;
+};
+
+struct target_recovery_data {
+ svc_handler_t trd_recovery_handler;
+ pid_t trd_processing_task;
+ struct completion trd_starting;
+ struct completion trd_finishing;
+};
+
+struct obd_llog_group {
+ int olg_seq;
+ struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS];
+ wait_queue_head_t olg_waitq;
+ spinlock_t olg_lock;
+ struct mutex olg_cat_processing;
+};
+
+/* corresponds to one of the obd's */
+#define OBD_DEVICE_MAGIC 0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME 0xffffd0de
+
+struct lvfs_run_ctxt {
+ struct dt_device *dt;
+};
+
+struct obd_device {
+ struct obd_type *obd_type;
+ __u32 obd_magic;
+
+ /* common and UUID name of this device */
+ char obd_name[MAX_OBD_NAME];
+ struct obd_uuid obd_uuid;
+
+ struct lu_device *obd_lu_dev;
+
+ int obd_minor;
+ /* bitfield modification is protected by obd_dev_lock */
+ unsigned long obd_attached:1, /* finished attach */
+ obd_set_up:1, /* finished setup */
+ obd_recovering:1, /* there are recoverable clients */
+ obd_abort_recovery:1,/* recovery expired */
+ obd_version_recov:1, /* obd uses version checking */
+ obd_replayable:1, /* recovery is enabled; inform clients */
+ obd_no_transno:1, /* no committed-transno notification */
+ obd_no_recov:1, /* fail instead of retry messages */
+ obd_stopping:1, /* started cleanup */
+ obd_starting:1, /* started setup */
+ obd_force:1, /* cleanup with > 0 obd refcount */
+ obd_fail:1, /* cleanup with failover */
+ obd_async_recov:1, /* allow asynchronous orphan cleanup */
+ obd_no_conn:1, /* deny new connections */
+ obd_inactive:1, /* device active/inactive
+ * (for /proc/status only!!) */
+ obd_no_ir:1, /* no imperative recovery. */
+ obd_process_conf:1; /* device is processing mgs config */
+ /* use a separate field as it is set in interrupt context, so as not to
+ * mess with the protection of the other bits using the _bh lock */
+ unsigned long obd_recovery_expired:1;
+ /* uuid-export hash body */
+ struct cfs_hash *obd_uuid_hash;
+ /* nid-export hash body */
+ struct cfs_hash *obd_nid_hash;
+ /* nid stats body */
+ struct cfs_hash *obd_nid_stats_hash;
+ struct list_head obd_nid_stats;
+ atomic_t obd_refcount;
+ wait_queue_head_t obd_refcount_waitq;
+ struct list_head obd_exports;
+ struct list_head obd_unlinked_exports;
+ struct list_head obd_delayed_exports;
+ int obd_num_exports;
+ spinlock_t obd_nid_lock;
+ struct ldlm_namespace *obd_namespace;
+ struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */
+ /* a spinlock is OK for what we do now, may need a semaphore later */
+ spinlock_t obd_dev_lock; /* protect OBD bitfield above */
+ struct mutex obd_dev_mutex;
+ __u64 obd_last_committed;
+ spinlock_t obd_osfs_lock;
+ struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */
+ __u64 obd_osfs_age;
+ struct lvfs_run_ctxt obd_lvfs_ctxt;
+ struct obd_llog_group obd_olg; /* default llog group */
+ struct obd_device *obd_observer;
+ struct rw_semaphore obd_observer_link_sem;
+ struct obd_notify_upcall obd_upcall;
+ struct obd_export *obd_self_export;
+ /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
+ struct list_head obd_exports_timed;
+ time_t obd_eviction_timer; /* for ping evictor */
+
+ int obd_max_recoverable_clients;
+ atomic_t obd_connected_clients;
+ int obd_stale_clients;
+ int obd_delayed_clients;
+ /* this lock protects all recovery list_heads, timer and
+ * obd_next_recovery_transno value */
+ spinlock_t obd_recovery_task_lock;
+ __u64 obd_next_recovery_transno;
+ int obd_replayed_requests;
+ int obd_requests_queued_for_recovery;
+ wait_queue_head_t obd_next_transno_waitq;
+ /* protected by obd_recovery_task_lock */
+ struct timer_list obd_recovery_timer;
+ time_t obd_recovery_start; /* seconds */
+ time_t obd_recovery_end; /* seconds, for lprocfs_status */
+ int obd_recovery_time_hard;
+ int obd_recovery_timeout;
+ int obd_recovery_ir_factor;
+
+ /* new recovery stuff from CMD2 */
+ struct target_recovery_data obd_recovery_data;
+ int obd_replayed_locks;
+ atomic_t obd_req_replay_clients;
+ atomic_t obd_lock_replay_clients;
+ /* all lists are protected by obd_recovery_task_lock */
+ struct list_head obd_req_replay_queue;
+ struct list_head obd_lock_replay_queue;
+ struct list_head obd_final_req_queue;
+ int obd_recovery_stage;
+
+ union {
+ struct client_obd cli;
+ struct echo_client_obd echo_client;
+ struct lov_obd lov;
+ struct lmv_obd lmv;
+ } u;
+ /* Fields used by LProcFS */
+ unsigned int obd_cntr_base;
+ struct lprocfs_stats *obd_stats;
+
+ unsigned int md_cntr_base;
+ struct lprocfs_stats *md_stats;
+
+ struct proc_dir_entry *obd_proc_entry;
+ void *obd_proc_private; /* type private PDEs */
+ struct proc_dir_entry *obd_proc_exports_entry;
+ struct proc_dir_entry *obd_svc_procroot;
+ struct lprocfs_stats *obd_svc_stats;
+ atomic_t obd_evict_inprogress;
+ wait_queue_head_t obd_evict_inprogress_waitq;
+ struct list_head obd_evict_list; /* protected with pet_lock */
+
+ /**
+ * Ldlm pool part. Save last calculated SLV and Limit.
+ */
+ rwlock_t obd_pool_lock;
+ int obd_pool_limit;
+ __u64 obd_pool_slv;
+
+ /**
+ * A list of outstanding class_incref()'s against this obd. For
+ * debugging.
+ */
+ struct lu_ref obd_reference;
+
+ int obd_conn_inprogress;
+};
+
+#define OBD_LLOG_FL_SENDNOW 0x0001
+#define OBD_LLOG_FL_EXIT 0x0002
+
+enum obd_cleanup_stage {
+/* Special case hack for MDS LOVs */
+ OBD_CLEANUP_EARLY,
+/* can be directly mapped to .ldto_device_fini() */
+ OBD_CLEANUP_EXPORTS,
+};
+
+/* get/set_info keys */
+#define KEY_ASYNC "async"
+#define KEY_BLOCKSIZE_BITS "blocksize_bits"
+#define KEY_BLOCKSIZE "blocksize"
+#define KEY_CAPA_KEY "capa_key"
+#define KEY_CHANGELOG_CLEAR "changelog_clear"
+#define KEY_FID2PATH "fid2path"
+#define KEY_CHECKSUM "checksum"
+#define KEY_CLEAR_FS "clear_fs"
+#define KEY_CONN_DATA "conn_data"
+#define KEY_EVICT_BY_NID "evict_by_nid"
+#define KEY_FIEMAP "fiemap"
+#define KEY_FLUSH_CTX "flush_ctx"
+#define KEY_GRANT_SHRINK "grant_shrink"
+#define KEY_HSM_COPYTOOL_SEND "hsm_send"
+#define KEY_INIT_RECOV_BACKUP "init_recov_bk"
+#define KEY_INIT_RECOV "initial_recov"
+#define KEY_INTERMDS "inter_mds"
+#define KEY_LAST_ID "last_id"
+#define KEY_LAST_FID "last_fid"
+#define KEY_LOCK_TO_STRIPE "lock_to_stripe"
+#define KEY_LOVDESC "lovdesc"
+#define KEY_LOV_IDX "lov_idx"
+#define KEY_MAX_EASIZE "max_easize"
+#define KEY_DEFAULT_EASIZE "default_easize"
+#define KEY_MAX_COOKIESIZE "max_cookiesize"
+#define KEY_DEFAULT_COOKIESIZE "default_cookiesize"
+#define KEY_MDS_CONN "mds_conn"
+#define KEY_MGSSEC "mgssec"
+#define KEY_NEXT_ID "next_id"
+#define KEY_READ_ONLY "read-only"
+#define KEY_REGISTER_TARGET "register_target"
+#define KEY_SET_FS "set_fs"
+#define KEY_TGT_COUNT "tgt_count"
+/* KEY_SET_INFO in lustre_idl.h */
+#define KEY_SPTLRPC_CONF "sptlrpc_conf"
+#define KEY_CONNECT_FLAG "connect_flags"
+#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel"
+
+#define KEY_CACHE_SET "cache_set"
+#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink"
+#define KEY_CHANGELOG_INDEX "changelog_index"
+
+struct lu_context;
+
+/* /!\ must be coherent with include/linux/namei.h on patched kernel */
+#define IT_OPEN (1 << 0)
+#define IT_CREAT (1 << 1)
+#define IT_READDIR (1 << 2)
+#define IT_GETATTR (1 << 3)
+#define IT_LOOKUP (1 << 4)
+#define IT_UNLINK (1 << 5)
+#define IT_TRUNC (1 << 6)
+#define IT_GETXATTR (1 << 7)
+#define IT_EXEC (1 << 8)
+#define IT_PIN (1 << 9)
+#define IT_LAYOUT (1 << 10)
+#define IT_QUOTA_DQACQ (1 << 11)
+#define IT_QUOTA_CONN (1 << 12)
+#define IT_SETXATTR (1 << 13)
+
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+ /* CREAT needs to be tested before open (both could be set) */
+ if (it->it_op & IT_CREAT)
+ return LCK_CW;
+ else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP |
+ IT_LAYOUT))
+ return LCK_CR;
+ else if (it->it_op & IT_GETXATTR)
+ return LCK_PR;
+ else if (it->it_op & IT_SETXATTR)
+ return LCK_PW;
+
+ LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+ return -EINVAL;
+}
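+
+/*
+ * Illustrative sketch only: because IT_CREAT is tested before IT_OPEN above,
+ * an open-with-create intent maps to a CW lock rather than the CR lock a
+ * plain open would get. The function name is hypothetical.
+ */
+static inline int it_to_lock_mode_sketch(struct lookup_intent *it)
+{
+ it->it_op = IT_OPEN | IT_CREAT;
+ return it_to_lock_mode(it); /* returns LCK_CW, not LCK_CR */
+}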
+
+struct md_op_data {
+ struct lu_fid op_fid1; /* operation fid1 (usually parent) */
+ struct lu_fid op_fid2; /* operation fid2 (usually child) */
+ struct lu_fid op_fid3; /* 2 extra fids to find conflicting */
+ struct lu_fid op_fid4; /* to the operation locks. */
+ u32 op_mds; /* what mds server open will go to */
+ struct lustre_handle op_handle;
+ s64 op_mod_time;
+ const char *op_name;
+ int op_namelen;
+ __u32 op_mode;
+ struct lmv_stripe_md *op_mea1;
+ struct lmv_stripe_md *op_mea2;
+ __u32 op_suppgids[2];
+ __u32 op_fsuid;
+ __u32 op_fsgid;
+ cfs_cap_t op_cap;
+ void *op_data;
+
+ /* iattr fields and blocks. */
+ struct iattr op_attr;
+ unsigned int op_attr_flags;
+ __u64 op_valid;
+ loff_t op_attr_blocks;
+
+ /* Size-on-MDS epoch and flags. */
+ __u64 op_ioepoch;
+ __u32 op_flags;
+
+ /* Capa fields */
+ struct obd_capa *op_capa1;
+ struct obd_capa *op_capa2;
+
+ /* Various operation flags. */
+ enum mds_op_bias op_bias;
+
+ /* Operation type */
+ __u32 op_opc;
+
+ /* Used by readdir */
+ __u64 op_offset;
+
+ /* Used by readdir */
+ __u32 op_npages;
+
+ /* used to transfer info between the stacks of MD client
+ * see enum op_cli_flags */
+ __u32 op_cli_flags;
+
+ /* File object data version for HSM release, on client */
+ __u64 op_data_version;
+ struct lustre_handle op_lease_handle;
+};
+
+enum op_cli_flags {
+ CLI_SET_MEA = 1 << 0,
+ CLI_RM_ENTRY = 1 << 1,
+};
+
+struct md_enqueue_info;
+/* metadata stat-ahead */
+
+struct md_enqueue_info {
+ struct md_op_data mi_data;
+ struct lookup_intent mi_it;
+ struct lustre_handle mi_lockh;
+ struct inode *mi_dir;
+ int (*mi_cb)(struct ptlrpc_request *req,
+ struct md_enqueue_info *minfo, int rc);
+ __u64 mi_cbdata;
+ unsigned int mi_generation;
+};
+
+struct obd_ops {
+ struct module *o_owner;
+ int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void *uarg);
+ int (*o_get_info)(const struct lu_env *env, struct obd_export *,
+ __u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm);
+ int (*o_set_info_async)(const struct lu_env *, struct obd_export *,
+ __u32 keylen, void *key,
+ __u32 vallen, void *val,
+ struct ptlrpc_request_set *set);
+ int (*o_attach)(struct obd_device *dev, u32 len, void *data);
+ int (*o_detach)(struct obd_device *dev);
+ int (*o_setup)(struct obd_device *dev, struct lustre_cfg *cfg);
+ int (*o_precleanup)(struct obd_device *dev,
+ enum obd_cleanup_stage cleanup_stage);
+ int (*o_cleanup)(struct obd_device *dev);
+ int (*o_process_config)(struct obd_device *dev, u32 len, void *data);
+ int (*o_postrecov)(struct obd_device *dev);
+ int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority);
+ int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
+ /* connect to the target device with given connection
+ * data. @ocd->ocd_connect_flags is modified to reflect flags actually
+ * granted by the target, which are guaranteed to be a subset of flags
+ * asked for. If @ocd == NULL, use default parameters. */
+ int (*o_connect)(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *src,
+ struct obd_uuid *cluuid, struct obd_connect_data *ocd,
+ void *localdata);
+ int (*o_reconnect)(const struct lu_env *env,
+ struct obd_export *exp, struct obd_device *src,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *ocd,
+ void *localdata);
+ int (*o_disconnect)(struct obd_export *exp);
+
+ /* Initialize/finalize fids infrastructure. */
+ int (*o_fid_init)(struct obd_device *obd,
+ struct obd_export *exp, enum lu_cli_type type);
+ int (*o_fid_fini)(struct obd_device *obd);
+
+ /* Allocate a new fid according to the hint passed in @op_data. */
+ int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid,
+ struct md_op_data *op_data);
+
+ /*
+ * Object with @fid is getting deleted, we may want to do something
+ * about this.
+ */
+ int (*o_statfs)(const struct lu_env *, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age, __u32 flags);
+ int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo,
+ __u64 max_age, struct ptlrpc_request_set *set);
+ int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
+ struct lov_stripe_md *mem_src);
+ int (*o_unpackmd)(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt,
+ struct lov_mds_md *disk_src, int disk_len);
+ int (*o_preallocate)(struct lustre_handle *, u32 *req, u64 *ids);
+ /* FIXME: add fid capability support for create & destroy! */
+ int (*o_create)(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti);
+ int (*o_destroy)(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md *ea,
+ struct obd_trans_info *oti, struct obd_export *md_exp,
+ void *capa);
+ int (*o_setattr)(const struct lu_env *, struct obd_export *exp,
+ struct obd_info *oinfo, struct obd_trans_info *oti);
+ int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset);
+ int (*o_getattr)(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo);
+ int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+ struct ptlrpc_request_set *set);
+ int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
+ u64 size, int shrink);
+ int (*o_preprw)(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa, int objcount,
+ struct obd_ioobj *obj, struct niobuf_remote *remote,
+ int *nr_pages, struct niobuf_local *local,
+ struct obd_trans_info *oti, struct lustre_capa *capa);
+ int (*o_commitrw)(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa,
+ int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *remote, int pages,
+ struct niobuf_local *local,
+ struct obd_trans_info *oti, int rc);
+ int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *,
+ ldlm_iterator_t it, void *data);
+ int (*o_init_export)(struct obd_export *exp);
+ int (*o_destroy_export)(struct obd_export *exp);
+
+ /* metadata-only methods */
+ int (*o_import_event)(struct obd_device *, struct obd_import *,
+ enum obd_import_event);
+
+ int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
+ enum obd_notify_event ev, void *data);
+
+ int (*o_health_check)(const struct lu_env *env, struct obd_device *);
+ struct obd_uuid *(*o_get_uuid)(struct obd_export *exp);
+
+ /* quota methods */
+ int (*o_quotacheck)(struct obd_device *, struct obd_export *,
+ struct obd_quotactl *);
+ int (*o_quotactl)(struct obd_device *, struct obd_export *,
+ struct obd_quotactl *);
+
+ /* pools methods */
+ int (*o_pool_new)(struct obd_device *obd, char *poolname);
+ int (*o_pool_del)(struct obd_device *obd, char *poolname);
+ int (*o_pool_add)(struct obd_device *obd, char *poolname,
+ char *ostname);
+ int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+ char *ostname);
+ void (*o_getref)(struct obd_device *obd);
+ void (*o_putref)(struct obd_device *obd);
+ /*
+ * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
+ * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
+ * Also, add a wrapper function in include/linux/obd_class.h. */
+};
+
+enum {
+ LUSTRE_OPC_MKDIR = (1 << 0),
+ LUSTRE_OPC_SYMLINK = (1 << 1),
+ LUSTRE_OPC_MKNOD = (1 << 2),
+ LUSTRE_OPC_CREATE = (1 << 3),
+ LUSTRE_OPC_ANY = (1 << 4)
+};
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR 0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS 0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b
+
+#define MAX_HASH_SIZE_32 0x7fffffffUL
+#define MAX_HASH_SIZE 0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL
+
+struct lustre_md {
+ struct mdt_body *body;
+ struct lov_stripe_md *lsm;
+ struct lmv_stripe_md *mea;
+#ifdef CONFIG_FS_POSIX_ACL
+ struct posix_acl *posix_acl;
+#endif
+ struct mdt_remote_perm *remote_perm;
+ struct obd_capa *mds_capa;
+ struct obd_capa *oss_capa;
+};
+
+struct md_open_data {
+ struct obd_client_handle *mod_och;
+ struct ptlrpc_request *mod_open_req;
+ struct ptlrpc_request *mod_close_req;
+ atomic_t mod_refcount;
+ bool mod_is_create;
+};
+
+struct lookup_intent;
+
+struct md_ops {
+ int (*m_getstatus)(struct obd_export *, struct lu_fid *,
+ struct obd_capa **);
+ int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
+ int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *,
+ ldlm_iterator_t, void *);
+ int (*m_close)(struct obd_export *, struct md_op_data *,
+ struct md_open_data *, struct ptlrpc_request **);
+ int (*m_create)(struct obd_export *, struct md_op_data *,
+ const void *, int, int, __u32, __u32, cfs_cap_t,
+ __u64, struct ptlrpc_request **);
+ int (*m_done_writing)(struct obd_export *, struct md_op_data *,
+ struct md_open_data *);
+ int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *,
+ struct lookup_intent *, struct md_op_data *,
+ struct lustre_handle *, void *, int,
+ struct ptlrpc_request **, __u64);
+ int (*m_getattr)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+ int (*m_getattr_name)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+ int (*m_intent_lock)(struct obd_export *, struct md_op_data *,
+ void *, int, struct lookup_intent *, int,
+ struct ptlrpc_request **,
+ ldlm_blocking_callback, __u64);
+ int (*m_link)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+ int (*m_rename)(struct obd_export *, struct md_op_data *,
+ const char *, int, const char *, int,
+ struct ptlrpc_request **);
+ int (*m_is_subdir)(struct obd_export *, const struct lu_fid *,
+ const struct lu_fid *,
+ struct ptlrpc_request **);
+ int (*m_setattr)(struct obd_export *, struct md_op_data *, void *,
+ int , void *, int, struct ptlrpc_request **,
+ struct md_open_data **mod);
+ int (*m_sync)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, struct ptlrpc_request **);
+ int (*m_readpage)(struct obd_export *, struct md_op_data *,
+ struct page **, struct ptlrpc_request **);
+
+ int (*m_unlink)(struct obd_export *, struct md_op_data *,
+ struct ptlrpc_request **);
+
+ int (*m_setxattr)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, u64, const char *,
+ const char *, int, int, int, __u32,
+ struct ptlrpc_request **);
+
+ int (*m_getxattr)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, u64, const char *,
+ const char *, int, int, int,
+ struct ptlrpc_request **);
+
+ int (*m_init_ea_size)(struct obd_export *, int, int, int, int);
+
+ int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *,
+ struct obd_export *, struct obd_export *,
+ struct lustre_md *);
+
+ int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
+
+ int (*m_set_open_replay_data)(struct obd_export *,
+ struct obd_client_handle *,
+ struct lookup_intent *);
+ int (*m_clear_open_replay_data)(struct obd_export *,
+ struct obd_client_handle *);
+ int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *);
+
+ ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64,
+ const struct lu_fid *, ldlm_type_t,
+ ldlm_policy_data_t *, ldlm_mode_t,
+ struct lustre_handle *);
+
+ int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *,
+ ldlm_policy_data_t *, ldlm_mode_t,
+ ldlm_cancel_flags_t flags, void *opaque);
+ int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc,
+ renew_capa_cb_t cb);
+ int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *,
+ const struct req_msg_field *, struct obd_capa **);
+
+ int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *,
+ struct obd_capa *, __u32,
+ struct ptlrpc_request **);
+
+ int (*m_intent_getattr_async)(struct obd_export *,
+ struct md_enqueue_info *,
+ struct ldlm_enqueue_info *);
+
+ int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *,
+ struct lu_fid *, __u64 *bits);
+
+ /*
+ * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to
+ * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a
+ * wrapper function in include/linux/obd_class.h.
+ */
+};
+
+struct lsm_operations {
+ void (*lsm_free)(struct lov_stripe_md *);
+ int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+ struct obd_export *md_exp);
+ void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, u64 *,
+ u64 *);
+ void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, u64 *,
+ u64 *);
+ int (*lsm_lmm_verify)(struct lov_mds_md *lmm, int lmm_bytes,
+ __u16 *stripe_count);
+ int (*lsm_unpackmd)(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm);
+};
+
+extern const struct lsm_operations lsm_v1_ops;
+extern const struct lsm_operations lsm_v3_ops;
+static inline const struct lsm_operations *lsm_op_find(int magic)
+{
+ switch (magic) {
+ case LOV_MAGIC_V1:
+ return &lsm_v1_ops;
+ case LOV_MAGIC_V3:
+ return &lsm_v3_ops;
+ default:
+ CERROR("Cannot recognize lsm_magic %08x\n", magic);
+ return NULL;
+ }
+}
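+
+/*
+ * Illustrative sketch only: lsm_op_find() is the dispatch point for the
+ * per-layout-version operations, so freeing an lsm typically looks like the
+ * hypothetical helper below. A NULL return means an unknown magic and must
+ * be handled by the caller.
+ */
+static inline void lsm_free_sketch(struct lov_stripe_md *lsm)
+{
+ const struct lsm_operations *op = lsm_op_find(lsm->lsm_magic);
+
+ if (op != NULL)
+ op->lsm_free(lsm);
+}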
+
+/* Requests for obd_extent_calc() */
+#define OBD_CALC_STRIPE_START 1
+#define OBD_CALC_STRIPE_END 2
+
+static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)
+{
+ return oinfo->oi_capa;
+}
+
+static inline struct md_open_data *obd_mod_alloc(void)
+{
+ struct md_open_data *mod;
+
+ OBD_ALLOC_PTR(mod);
+ if (mod == NULL)
+ return NULL;
+ atomic_set(&mod->mod_refcount, 1);
+ return mod;
+}
+
+#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount)
+#define obd_mod_put(mod) \
+({ \
+ if (atomic_dec_and_test(&(mod)->mod_refcount)) { \
+ if ((mod)->mod_open_req) \
+ ptlrpc_req_finished((mod)->mod_open_req); \
+ OBD_FREE_PTR(mod); \
+ } \
+})
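+
+/*
+ * Illustrative sketch only: md_open_data lifetime with the helpers above.
+ * obd_mod_alloc() returns the object with one reference held; each
+ * additional user takes its own reference with obd_mod_get() and the last
+ * obd_mod_put() finishes mod_open_req (if set) and frees the object. The
+ * function name is hypothetical.
+ */
+static inline void obd_mod_refcount_sketch(void)
+{
+ struct md_open_data *mod = obd_mod_alloc();
+
+ if (mod == NULL)
+ return;
+
+ obd_mod_get(mod); /* second user, refcount == 2 */
+ obd_mod_put(mod); /* refcount back to 1 */
+ obd_mod_put(mod); /* last reference: mod is freed */
+}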
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+
+/* return 1 if the client should resend the request */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+ return atomic_read(&cli->cl_resends) ?
+ atomic_read(&cli->cl_resends) > resend : 1;
+}
+
+/**
+ * Return device name for this device
+ *
+ * XXX: lu_device is declared before obd_device, while lu_device holds a
+ * pointer back to obd_device, so this helper function is defined here
+ * instead of in lu_object.h.
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+ return lu_dev->ld_obd->obd_name;
+}
+
+static inline bool filename_is_volatile(const char *name, int namelen, int *idx)
+{
+ const char *start;
+ char *end;
+
+ if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+ return false;
+
+ /* caller does not care about idx */
+ if (idx == NULL)
+ return true;
+
+ /* volatile file, the MDT can be set from name */
+ /* name format is LUSTRE_VOLATILE_HDR:[idx]: */
+ /* if no MDT is specified, use std way */
+ if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+ goto bad_format;
+ /* test for no MDT idx case */
+ if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') &&
+ (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) {
+ *idx = -1;
+ return true;
+ }
+ /* we have an idx, read it */
+ start = name + LUSTRE_VOLATILE_HDR_LEN + 1;
+ *idx = strtoul(start, &end, 0);
+ /* error cases:
+ * no digit, no trailing :, negative value
+ */
+ if (((*idx == 0) && (end == start)) ||
+ (*end != ':') || (*idx < 0))
+ goto bad_format;
+
+ return true;
+bad_format:
+ /* bad format of the MDT idx; we cannot return an error
+ * to the caller, so fall back to the hash algorithm */
+ CERROR("Bad volatile file name format: %s\n",
+ name + LUSTRE_VOLATILE_HDR_LEN);
+ return false;
+}
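+
+/*
+ * Illustrative sketch only: the volatile-name format accepted above is
+ * LUSTRE_VOLATILE_HDR ":<mdt idx>:" (or "::" when no MDT index is given).
+ * Assuming LUSTRE_VOLATILE_HDR expands to a string literal, the name built
+ * below parses to idx == 12, while a "::" suffix would yield idx == -1.
+ * The helper name is hypothetical.
+ */
+static inline bool filename_is_volatile_sketch(int *idx)
+{
+ static const char name[] = LUSTRE_VOLATILE_HDR ":12:";
+
+ return filename_is_volatile(name, sizeof(name) - 1, idx);
+}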
+
+static inline int cli_brw_size(struct obd_device *obd)
+{
+ LASSERT(obd != NULL);
+ return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+}
+
+#endif /* __OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h
new file mode 100644
index 000000000..c8249fbb0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cache.h
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h
new file mode 100644
index 000000000..3a63462aa
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_cksum.h
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include "../../include/linux/libcfs/libcfs.h"
+#include "lustre/lustre_idl.h"
+
+static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type)
+{
+ switch (cksum_type) {
+ case OBD_CKSUM_CRC32:
+ return CFS_HASH_ALG_CRC32;
+ case OBD_CKSUM_ADLER:
+ return CFS_HASH_ALG_ADLER32;
+ case OBD_CKSUM_CRC32C:
+ return CFS_HASH_ALG_CRC32C;
+ default:
+ CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+ LBUG();
+ }
+ return 0;
+}
+
+/* The OBD_FL_CKSUM_* flags are packed into 5 bits of o_flags, since there can
+ * only be a single checksum type per RPC.
+ *
+ * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask
+ * since they need to represent the full range of checksum algorithms that
+ * both the client and server can understand.
+ *
+ * In case of unsupported types/flags we fall back to ADLER,
+ * because it has been supported by all clients since 1.8.
+ *
+ * In case multiple algorithms are supported the best one is used. */
+static inline u32 cksum_type_pack(cksum_type_t cksum_type)
+{
+ unsigned int performance = 0, tmp;
+ u32 flag = OBD_FL_CKSUM_ADLER;
+
+ if (cksum_type & OBD_CKSUM_CRC32) {
+ tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32));
+ if (tmp > performance) {
+ performance = tmp;
+ flag = OBD_FL_CKSUM_CRC32;
+ }
+ }
+ if (cksum_type & OBD_CKSUM_CRC32C) {
+ tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C));
+ if (tmp > performance) {
+ performance = tmp;
+ flag = OBD_FL_CKSUM_CRC32C;
+ }
+ }
+ if (cksum_type & OBD_CKSUM_ADLER) {
+ tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER));
+ if (tmp > performance) {
+ performance = tmp;
+ flag = OBD_FL_CKSUM_ADLER;
+ }
+ }
+ if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C |
+ OBD_CKSUM_CRC32 |
+ OBD_CKSUM_ADLER))))
+ CWARN("unknown cksum type %x\n", cksum_type);
+
+ return flag;
+}
+
+static inline cksum_type_t cksum_type_unpack(u32 o_flags)
+{
+ switch (o_flags & OBD_FL_CKSUM_ALL) {
+ case OBD_FL_CKSUM_CRC32C:
+ return OBD_CKSUM_CRC32C;
+ case OBD_FL_CKSUM_CRC32:
+ return OBD_CKSUM_CRC32;
+ default:
+ break;
+ }
+
+ return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER has been supported since 1.8; it is the baseline and does not depend on hw.
+ * The client uses all available local algorithms.
+ */
+static inline cksum_type_t cksum_types_supported_client(void)
+{
+ cksum_type_t ret = OBD_CKSUM_ADLER;
+
+ CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0)
+ ret |= OBD_CKSUM_CRC32C;
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0)
+ ret |= OBD_CKSUM_CRC32;
+
+ return ret;
+}
+
+/* The server uses algorithms that perform at 50% or better of the Adler speed */
+static inline cksum_type_t cksum_types_supported_server(void)
+{
+ int base_speed;
+ cksum_type_t ret = OBD_CKSUM_ADLER;
+
+ CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+ cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+ base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2;
+
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >=
+ base_speed)
+ ret |= OBD_CKSUM_CRC32C;
+ if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >=
+ base_speed)
+ ret |= OBD_CKSUM_CRC32;
+
+ return ret;
+}
+
+
+/* Select the best checksum algorithm among those supplied in the cksum_types
+ * input.
+ *
+ * Currently, calling cksum_type_pack() with a mask will return the fastest
+ * checksum type due to its benchmarking at libcfs module load.
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server. */
+static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types)
+{
+ return cksum_type_unpack(cksum_type_pack(cksum_types));
+}
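+
+/*
+ * Illustrative sketch only: a client would typically intersect the
+ * ocd_cksum_types mask advertised by the server at connect time with its
+ * own supported set before selecting the algorithm to use. The helper name
+ * is hypothetical.
+ */
+static inline cksum_type_t cksum_negotiate_sketch(cksum_type_t ocd_cksum_types)
+{
+ /* keep only types both sides understand, then pick the fastest */
+ return cksum_type_select(ocd_cksum_types & cksum_types_supported_client());
+}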
+
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"}
+
+#endif /* __OBD_CKSUM */
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
new file mode 100644
index 000000000..34b5fa3f0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_class.h
@@ -0,0 +1,1929 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+
+#include "obd_support.h"
+#include "lustre_import.h"
+#include "lustre_net.h"
+#include "obd.h"
+#include "lustre_lib.h"
+#include "lustre/lustre_idl.h"
+#include "lprocfs_status.h"
+
+#define OBD_STATFS_NODELAY 0x0001 /* requests should be sent without delay
+ * and without resends, to avoid deadlocks */
+#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update
+ * obd_osfs_age */
+#define OBD_STATFS_PTLRPCD 0x0004 /* requests will be sent via ptlrpcd
+ * instead of a specific set. This
+ * means that we cannot rely on the set
+ * interpret routine to be called.
+ * lov_statfs_fini() must thus be called
+ * by the request interpret routine */
+#define OBD_STATFS_FOR_MDT0 0x0008 /* The statfs is only for retrieving
+ * information from MDT0. */
+#define OBD_FL_PUNCH 0x00000001 /* To indicate it is punch operation */
+
+/* OBD Device Declarations */
+extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
+extern rwlock_t obd_dev_lock;
+
+/* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_device *class_exp2obd(struct obd_export *);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+extern int lustre_get_jobid(char *jobid);
+
+struct lu_device_type;
+
+/* genops.c */
+extern struct list_head obd_types;
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *, struct md_ops *,
+ struct lprocfs_vars *, const char *nm,
+ struct lu_device_type *ldt);
+int class_unregister_type(const char *nm);
+
+struct obd_device *class_newdev(const char *type_name, const char *name);
+void class_release_dev(struct obd_device *obd);
+
+int class_name2dev(const char *name);
+struct obd_device *class_name2obd(const char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
+struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid,
+ const char *typ_name,
+ struct obd_uuid *grp_uuid);
+struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid,
+ int *next);
+struct obd_device *class_num2obd(int num);
+int get_devices_count(void);
+
+int class_notify_sptlrpc_conf(const char *fsname, int namelen);
+
+char *obd_export_nid2str(struct obd_export *exp);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);
+
+int obd_zombie_impexp_init(void);
+void obd_zombie_impexp_stop(void);
+void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
+int kuc_len(int payload_len);
+struct kuc_hdr *kuc_ptr(void *p);
+int kuc_ispayload(void *p);
+void *kuc_alloc(int payload_len, int transport, int type);
+void kuc_free(void *p, int payload_len);
+
+struct llog_handle;
+struct llog_rec_hdr;
+typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
+ struct llog_rec_hdr *, void *);
+/* obd_config.c */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+ const char *new_name);
+int class_process_config(struct lustre_cfg *lcfg);
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+ struct lustre_cfg *lcfg, void *data);
+int class_attach(struct lustre_cfg *lcfg);
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd,
+ const char *scope, const void *source);
+void class_decref(struct obd_device *obd,
+ const char *scope, const void *source);
+void dump_exports(struct obd_device *obd, int locks);
+int class_config_llog_handler(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data);
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_add_uuid(const char *uuid, __u64 nid);
+
+/*obdecho*/
+#if defined(CONFIG_PROC_FS)
+extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+#define CFG_F_START 0x01 /* Set when we start updating from a log */
+#define CFG_F_MARKER 0x02 /* We are within a marker */
+#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */
+#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */
+
+/* Passed as data param to class_config_parse_llog */
+struct config_llog_instance {
+ char *cfg_obdname;
+ void *cfg_instance;
+ struct super_block *cfg_sb;
+ struct obd_uuid cfg_uuid;
+ llog_cb_t cfg_callback;
+ int cfg_last_idx; /* for partial llog processing */
+ int cfg_flags;
+};
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name, struct config_llog_instance *cfg);
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name, struct config_llog_instance *cfg);
+
+enum {
+ CONFIG_T_CONFIG = 0,
+ CONFIG_T_SPTLRPC = 1,
+ CONFIG_T_RECOVER = 2,
+ CONFIG_T_PARAMS = 3,
+ CONFIG_T_MAX = 4
+};
+
+#define PARAMS_FILENAME "params"
+#define LCTL_UPCALL "lctl"
+
+/* list of active configuration logs */
+struct config_llog_data {
+ struct ldlm_res_id cld_resid;
+ struct config_llog_instance cld_cfg;
+ struct list_head cld_list_chain;
+ atomic_t cld_refcount;
+ struct config_llog_data *cld_sptlrpc;/* dependent sptlrpc log */
+ struct config_llog_data *cld_params; /* common parameters log */
+ struct config_llog_data *cld_recover;/* imperative recover log */
+ struct obd_export *cld_mgcexp;
+ struct mutex cld_lock;
+ int cld_type;
+ unsigned int cld_stopping:1, /* we were told to stop
+ * watching */
+ cld_lostlock:1; /* lock not requeued */
+ char cld_logname[0];
+};
+
+struct lustre_profile {
+ struct list_head lp_list;
+ char *lp_profile;
+ char *lp_dt;
+ char *lp_md;
+};
+
+struct lustre_profile *class_get_profile(const char *prof);
+void class_del_profile(const char *prof);
+void class_del_profiles(void);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *);
+void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *);
+extern void (*class_export_dump_hook)(struct obd_export *);
+
+#else
+
+#define __class_export_add_lock_ref(exp, lock) do {} while (0)
+#define __class_export_del_lock_ref(exp, lock) do {} while (0)
+
+#endif
+
+static inline void class_export_rpc_inc(struct obd_export *exp)
+{
+ atomic_inc(&(exp)->exp_rpc_count);
+ CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n",
+ (exp), atomic_read(&(exp)->exp_rpc_count));
+}
+
+static inline void class_export_rpc_dec(struct obd_export *exp)
+{
+ LASSERT_ATOMIC_POS(&exp->exp_rpc_count);
+ atomic_dec(&(exp)->exp_rpc_count);
+ CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n",
+ (exp), atomic_read(&(exp)->exp_rpc_count));
+}
+
+#define class_export_lock_get(exp, lock) \
+({ \
+ atomic_inc(&(exp)->exp_locks_count); \
+ __class_export_add_lock_ref(exp, lock); \
+ CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \
+ (exp), atomic_read(&(exp)->exp_locks_count)); \
+ class_export_get(exp); \
+})
+
+#define class_export_lock_put(exp, lock) \
+({ \
+ LASSERT_ATOMIC_POS(&exp->exp_locks_count); \
+ atomic_dec(&(exp)->exp_locks_count); \
+ __class_export_del_lock_ref(exp, lock); \
+ CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \
+ (exp), atomic_read(&(exp)->exp_locks_count)); \
+ class_export_put(exp); \
+})
+
+#define class_export_cb_get(exp) \
+({ \
+ atomic_inc(&(exp)->exp_cb_count); \
+ CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\
+ (exp), atomic_read(&(exp)->exp_cb_count)); \
+ class_export_get(exp); \
+})
+
+#define class_export_cb_put(exp) \
+({ \
+ LASSERT_ATOMIC_POS(&exp->exp_cb_count); \
+ atomic_dec(&(exp)->exp_cb_count); \
+ CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\
+ (exp), atomic_read(&(exp)->exp_cb_count)); \
+ class_export_put(exp); \
+})
+
+/* genops.c */
+struct obd_export *class_export_get(struct obd_export *exp);
+void class_export_put(struct obd_export *exp);
+struct obd_export *class_new_export(struct obd_device *obddev,
+ struct obd_uuid *cluuid);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(struct obd_device *obd);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_search_type(const char *name);
+struct obd_type *class_get_type(const char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+ struct obd_uuid *cluuid);
+int class_disconnect(struct obd_export *exp);
+void class_fail_export(struct obd_export *exp);
+int class_connected_export(struct obd_export *exp);
+void class_disconnect_exports(struct obd_device *obddev);
+int class_manual_cleanup(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *,
+ int (*test_export)(struct obd_export *));
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+ return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+ (obd->obd_force ? OBD_OPT_FORCE : 0) |
+ (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+ 0);
+}
+
+struct inode;
+struct lu_attr;
+struct obdo;
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid);
+void la_from_obdo(struct lu_attr *la, struct obdo *dst, u32 valid);
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid);
+void obdo_to_inode(struct inode *dst, struct obdo *src, u32 valid);
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, u32 valid);
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr,
+ unsigned int ia_valid);
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid);
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid);
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+ unsigned int valid);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo);
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo);
+
+#define OBT(dev) (dev)->obd_type
+#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op
+#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
+
+/* Ensure obd_setup has run: used for cleanup which must be callable
+ while the obd is stopping */
+static inline int obd_check_dev(struct obd_device *obd)
+{
+ if (!obd) {
+ CERROR("NULL device\n");
+ return -ENODEV;
+ }
+ return 0;
+}
+
+/* ensure obd_setup and !obd_stopping */
+static inline int obd_check_dev_active(struct obd_device *obd)
+{
+ int rc;
+
+ rc = obd_check_dev(obd);
+ if (rc)
+ return rc;
+ if (!obd->obd_set_up || obd->obd_stopping) {
+ CERROR("Device %d not setup\n", obd->obd_minor);
+ return -ENODEV;
+ }
+ return rc;
+}
+
+#if defined(CONFIG_PROC_FS)
+#define OBD_COUNTER_OFFSET(op) \
+ ((offsetof(struct obd_ops, o_ ## op) - \
+ offsetof(struct obd_ops, o_iocontrol)) \
+ / sizeof(((struct obd_ops *)(0))->o_iocontrol))
+
+#define OBD_COUNTER_INCREMENT(obdx, op) \
+ if ((obdx)->obd_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((obdx)->obd_cntr_base) + \
+ OBD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (obdx)->obd_stats->ls_num); \
+ lprocfs_counter_incr((obdx)->obd_stats, coffset); \
+ }
+
+#define EXP_COUNTER_INCREMENT(export, op) \
+ if ((export)->exp_obd->obd_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \
+ OBD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \
+ lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \
+ if ((export)->exp_nid_stats != NULL && \
+ (export)->exp_nid_stats->nid_stats != NULL) \
+ lprocfs_counter_incr( \
+ (export)->exp_nid_stats->nid_stats, coffset);\
+ }
+
+#define MD_COUNTER_OFFSET(op) \
+ ((offsetof(struct md_ops, m_ ## op) - \
+ offsetof(struct md_ops, m_getstatus)) \
+ / sizeof(((struct md_ops *)(0))->m_getstatus))
+
+#define MD_COUNTER_INCREMENT(obdx, op) \
+ if ((obdx)->md_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((obdx)->md_cntr_base) + \
+ MD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (obdx)->md_stats->ls_num); \
+ lprocfs_counter_incr((obdx)->md_stats, coffset); \
+ }
+
+#define EXP_MD_COUNTER_INCREMENT(export, op) \
+ if ((export)->exp_obd->md_stats != NULL) { \
+ unsigned int coffset; \
+ coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \
+ MD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \
+ lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \
+ if ((export)->exp_md_stats != NULL) \
+ lprocfs_counter_incr( \
+ (export)->exp_md_stats, coffset); \
+ }
+
+#else
+#define OBD_COUNTER_OFFSET(op)
+#define OBD_COUNTER_INCREMENT(obd, op)
+#define EXP_COUNTER_INCREMENT(exp, op)
+#define MD_COUNTER_INCREMENT(obd, op)
+#define EXP_MD_COUNTER_INCREMENT(exp, op)
+#endif
+
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat *tmp)
+{
+ /* Always add in ldlm_stats */
+ tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+ LPROCFS_STATS_FLAG_NOPERCPU);
+ if (tmp->nid_ldlm_stats == NULL)
+ return -ENOMEM;
+
+ lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+ return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+ tmp->nid_ldlm_stats);
+}
+
+#define OBD_CHECK_MD_OP(obd, op, err) \
+do { \
+ if (!OBT(obd) || !MDP((obd), op)) { \
+ if (err) \
+ CERROR("md_" #op ": dev %s/%d no operation\n", \
+ obd->obd_name, obd->obd_minor); \
+ return err; \
+ } \
+} while (0)
+
+#define EXP_CHECK_MD_OP(exp, op) \
+do { \
+ if ((exp) == NULL) { \
+ CERROR("obd_" #op ": NULL export\n"); \
+ return -ENODEV; \
+ } \
+ if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \
+ CERROR("obd_" #op ": cleaned up obd\n"); \
+ return -EOPNOTSUPP; \
+ } \
+ if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \
+ CERROR("obd_" #op ": dev %s/%d no operation\n", \
+ (exp)->exp_obd->obd_name, \
+ (exp)->exp_obd->obd_minor); \
+ return -EOPNOTSUPP; \
+ } \
+} while (0)
+
+
+#define OBD_CHECK_DT_OP(obd, op, err) \
+do { \
+ if (!OBT(obd) || !OBP((obd), op)) { \
+ if (err) \
+ CERROR("obd_" #op ": dev %d no operation\n", \
+ obd->obd_minor); \
+ return err; \
+ } \
+} while (0)
+
+#define EXP_CHECK_DT_OP(exp, op) \
+do { \
+ if ((exp) == NULL) { \
+ CERROR("obd_" #op ": NULL export\n"); \
+ return -ENODEV; \
+ } \
+ if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \
+ CERROR("obd_" #op ": cleaned up obd\n"); \
+ return -EOPNOTSUPP; \
+ } \
+ if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \
+ CERROR("obd_" #op ": dev %d no operation\n", \
+ (exp)->exp_obd->obd_minor); \
+ return -EOPNOTSUPP; \
+ } \
+} while (0)
+
+#define CTXT_CHECK_OP(ctxt, op, err) \
+do { \
+ if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \
+ if (err) \
+ CERROR("lop_" #op ": dev %d no operation\n", \
+ ctxt->loc_obd->obd_minor); \
+ return err; \
+ } \
+} while (0)
+
+static inline int class_devno_max(void)
+{
+ return MAX_OBD_DEVICES;
+}
+
+static inline int obd_get_info(const struct lu_env *env,
+ struct obd_export *exp, __u32 keylen,
+ void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, get_info);
+ EXP_COUNTER_INCREMENT(exp, get_info);
+
+ rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val,
+ lsm);
+ return rc;
+}
+
+static inline int obd_set_info_async(const struct lu_env *env,
+ struct obd_export *exp, u32 keylen,
+ void *key, u32 vallen, void *val,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, set_info_async);
+ EXP_COUNTER_INCREMENT(exp, set_info_async);
+
+ rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+ val, set);
+ return rc;
+}
+
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into new lu_device-based layering, but some
+ * pieces of configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call one XOR
+ * another.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
+#define DECLARE_LU_VARS(ldt, d) \
+ struct lu_device_type *ldt; \
+ struct lu_device *d
+
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+
+ ldt = obd->obd_type->typ_lu;
+ if (ldt != NULL) {
+ struct lu_context session_ctx;
+ struct lu_env env;
+ lu_context_init(&session_ctx, LCT_SESSION);
+ session_ctx.lc_thread = NULL;
+ lu_context_enter(&session_ctx);
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ env.le_ses = &session_ctx;
+ d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+ lu_env_fini(&env);
+ if (!IS_ERR(d)) {
+ obd->obd_lu_dev = d;
+ d->ld_obd = obd;
+ rc = 0;
+ } else
+ rc = PTR_ERR(d);
+ }
+ lu_context_exit(&session_ctx);
+ lu_context_fini(&session_ctx);
+
+ } else {
+ OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, setup);
+ rc = OBP(obd, setup)(obd, cfg);
+ }
+ return rc;
+}
+
+static inline int obd_precleanup(struct obd_device *obd,
+ enum obd_cleanup_stage cleanup_stage)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+
+ rc = obd_check_dev(obd);
+ if (rc)
+ return rc;
+ ldt = obd->obd_type->typ_lu;
+ d = obd->obd_lu_dev;
+ if (ldt != NULL && d != NULL) {
+ if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
+ struct lu_env env;
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ ldt->ldt_ops->ldto_device_fini(&env, d);
+ lu_env_fini(&env);
+ }
+ }
+ }
+ OBD_CHECK_DT_OP(obd, precleanup, 0);
+ OBD_COUNTER_INCREMENT(obd, precleanup);
+
+ rc = OBP(obd, precleanup)(obd, cleanup_stage);
+ return rc;
+}
+
+static inline int obd_cleanup(struct obd_device *obd)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+
+ rc = obd_check_dev(obd);
+ if (rc)
+ return rc;
+
+ ldt = obd->obd_type->typ_lu;
+ d = obd->obd_lu_dev;
+ if (ldt != NULL && d != NULL) {
+ struct lu_env env;
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ ldt->ldt_ops->ldto_device_free(&env, d);
+ lu_env_fini(&env);
+ obd->obd_lu_dev = NULL;
+ }
+ }
+ OBD_CHECK_DT_OP(obd, cleanup, 0);
+ OBD_COUNTER_INCREMENT(obd, cleanup);
+
+ rc = OBP(obd, cleanup)(obd);
+ return rc;
+}
+
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+ /* If we set up but never connected, the
+ client import will not have been cleaned. */
+ down_write(&obd->u.cli.cl_sem);
+ if (obd->u.cli.cl_import) {
+ struct obd_import *imp;
+ imp = obd->u.cli.cl_import;
+ CDEBUG(D_CONFIG, "%s: client import never connected\n",
+ obd->obd_name);
+ ptlrpc_invalidate_import(imp);
+ if (imp->imp_rq_pool) {
+ ptlrpc_free_rq_pool(imp->imp_rq_pool);
+ imp->imp_rq_pool = NULL;
+ }
+ client_destroy_import(imp);
+ obd->u.cli.cl_import = NULL;
+ }
+ up_write(&obd->u.cli.cl_sem);
+}
+
+static inline int
+obd_process_config(struct obd_device *obd, int datalen, void *data)
+{
+ int rc;
+ DECLARE_LU_VARS(ldt, d);
+
+ rc = obd_check_dev(obd);
+ if (rc)
+ return rc;
+
+ obd->obd_process_conf = 1;
+ ldt = obd->obd_type->typ_lu;
+ d = obd->obd_lu_dev;
+ if (ldt != NULL && d != NULL) {
+ struct lu_env env;
+
+ rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+ if (rc == 0) {
+ rc = d->ld_ops->ldo_process_config(&env, d, data);
+ lu_env_fini(&env);
+ }
+ } else {
+ OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP);
+ rc = OBP(obd, process_config)(obd, datalen, data);
+ }
+ OBD_COUNTER_INCREMENT(obd, process_config);
+ obd->obd_process_conf = 0;
+
+ return rc;
+}
+
+/* Pack an in-memory MD struct for storage on disk.
+ * Returns +ve size of packed MD (0 for free), or -ve error.
+ *
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated
+ */
+static inline int obd_packmd(struct obd_export *exp,
+ struct lov_mds_md **disk_tgt,
+ struct lov_stripe_md *mem_src)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, packmd);
+ EXP_COUNTER_INCREMENT(exp, packmd);
+
+ rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src);
+ return rc;
+}
+
+static inline int obd_size_diskmd(struct obd_export *exp,
+ struct lov_stripe_md *mem_src)
+{
+ return obd_packmd(exp, NULL, mem_src);
+}
+
+static inline int obd_free_diskmd(struct obd_export *exp,
+ struct lov_mds_md **disk_tgt)
+{
+ LASSERT(disk_tgt);
+ LASSERT(*disk_tgt);
+ /*
+ * LU-2590, for caller's convenience, *disk_tgt could be host
+ * endianness, it needs swab to LE if necessary, while just
+ * lov_mds_md header needs it for figuring out how much memory
+ * needs to be freed.
+ */
+ if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+ (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) ||
+ ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3)))
+ lustre_swab_lov_mds_md(*disk_tgt);
+ return obd_packmd(exp, disk_tgt, NULL);
+}
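+
+/*
+ * Illustrative sketch only (not part of the original header): how a
+ * hypothetical caller might combine the helpers above to pack an
+ * in-memory LSM for disk.  Error handling is abbreviated.
+ */
+static inline int example_pack_lsm(struct obd_export *exp,
+ struct lov_stripe_md *lsm)
+{
+ struct lov_mds_md *lmm = NULL;
+ int size;
+
+ /* *disk_tgt == NULL: a buffer is allocated and the packed
+ * size (+ve) is returned. */
+ size = obd_packmd(exp, &lmm, lsm);
+ if (size < 0)
+ return size;
+
+ /* ... write 'size' bytes starting at 'lmm' to disk ... */
+
+ /* mem_src == NULL: the buffer allocated above is freed. */
+ return obd_free_diskmd(exp, &lmm);
+}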
+
+/* Unpack an MD struct from disk to in-memory format.
+ * Returns +ve size of unpacked MD (0 for free), or -ve error.
+ *
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
+ * If @*mem_tgt == NULL, it will be allocated
+ */
+static inline int obd_unpackmd(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt,
+ struct lov_mds_md *disk_src,
+ int disk_len)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, unpackmd);
+ EXP_COUNTER_INCREMENT(exp, unpackmd);
+
+ rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len);
+ return rc;
+}
+
+/* helper functions */
+static inline int obd_alloc_memmd(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt)
+{
+ LASSERT(mem_tgt);
+ LASSERT(*mem_tgt == NULL);
+ return obd_unpackmd(exp, mem_tgt, NULL, 0);
+}
+
+static inline int obd_free_memmd(struct obd_export *exp,
+ struct lov_stripe_md **mem_tgt)
+{
+ int rc;
+
+ LASSERT(mem_tgt);
+ LASSERT(*mem_tgt);
+ rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+ *mem_tgt = NULL;
+ return rc;
+}
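+
+/*
+ * Illustrative sketch only (not part of the original header): unpacking an
+ * on-disk lov_mds_md into an in-memory LSM and releasing it again with the
+ * helpers above.
+ */
+static inline int example_unpack_lmm(struct obd_export *exp,
+ struct lov_mds_md *lmm, int lmm_size)
+{
+ struct lov_stripe_md *lsm = NULL;
+ int rc;
+
+ rc = obd_unpackmd(exp, &lsm, lmm, lmm_size);
+ if (rc < 0)
+ return rc;
+
+ /* ... use 'lsm' ... */
+
+ return obd_free_memmd(exp, &lsm);
+}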
+
+static inline int obd_create(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *obdo, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, create);
+ EXP_COUNTER_INCREMENT(exp, create);
+
+ rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti);
+ return rc;
+}
+
+static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *obdo, struct lov_stripe_md *ea,
+ struct obd_trans_info *oti,
+ struct obd_export *md_exp, void *capa)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, destroy);
+ EXP_COUNTER_INCREMENT(exp, destroy);
+
+ rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa);
+ return rc;
+}
+
+static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, getattr);
+ EXP_COUNTER_INCREMENT(exp, getattr);
+
+ rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo);
+ return rc;
+}
+
+static inline int obd_getattr_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, getattr_async);
+ EXP_COUNTER_INCREMENT(exp, getattr_async);
+
+ rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
+ return rc;
+}
+
+static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, setattr);
+ EXP_COUNTER_INCREMENT(exp, setattr);
+
+ rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti);
+ return rc;
+}
+
+/* This performs all the request set init/wait/destroy actions. */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti)
+{
+ struct ptlrpc_request_set *set = NULL;
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, setattr_async);
+ EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ return -ENOMEM;
+
+ rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ return rc;
+}
+
+/* This adds all the requests into @set if @set != NULL, otherwise
+ all requests are sent asynchronously without waiting for a response. */
+static inline int obd_setattr_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *set)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, setattr_async);
+ EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+ rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+ return rc;
+}
+
+static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority)
+{
+ struct obd_device *obd = imp->imp_obd;
+ int rc;
+
+ rc = obd_check_dev_active(obd);
+ if (rc)
+ return rc;
+ OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, add_conn);
+
+ rc = OBP(obd, add_conn)(imp, uuid, priority);
+ return rc;
+}
+
+static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+ struct obd_device *obd = imp->imp_obd;
+ int rc;
+
+ rc = obd_check_dev_active(obd);
+ if (rc)
+ return rc;
+ OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, del_conn);
+
+ rc = OBP(obd, del_conn)(imp, uuid);
+ return rc;
+}
+
+static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
+{
+ struct obd_uuid *uuid;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL);
+ EXP_COUNTER_INCREMENT(exp, get_uuid);
+
+ uuid = OBP(exp->exp_obd, get_uuid)(exp);
+ return uuid;
+}
+
+/** Create a new export \a exp on device \a obd for the uuid \a cluuid
+ * @param exp New export handle
+ * @param data Connect data; supported flags are set on entry, and only the
+ * flags also understood by obd are returned.
+ */
+static inline int obd_connect(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *data,
+ void *localdata)
+{
+ int rc;
+ __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition
+ * check */
+
+ rc = obd_check_dev_active(obd);
+ if (rc)
+ return rc;
+ OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, connect);
+
+ rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata);
+ /* check that only subset is granted */
+ LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) ==
+ data->ocd_connect_flags));
+ return rc;
+}
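+
+/*
+ * Illustrative sketch only (not part of the original header): a hypothetical
+ * caller checking which of its requested connect flags were actually
+ * granted.  Assumes 'data' is non-NULL and pre-filled by the caller.
+ */
+static inline int example_connect(const struct lu_env *env,
+ struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *data)
+{
+ struct obd_export *exp = NULL;
+ __u64 wanted = data->ocd_connect_flags;
+ int rc;
+
+ rc = obd_connect(env, &exp, obd, cluuid, data, NULL);
+ if (rc)
+ return rc;
+
+ /* Per the post-condition above, only a subset of 'wanted' remains. */
+ CDEBUG(D_INFO, "granted %#llx of requested %#llx\n",
+ data->ocd_connect_flags, wanted);
+ return 0;
+}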
+
+static inline int obd_reconnect(const struct lu_env *env,
+ struct obd_export *exp,
+ struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *d,
+ void *localdata)
+{
+ int rc;
+ __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
+ * check */
+
+ rc = obd_check_dev_active(obd);
+ if (rc)
+ return rc;
+ OBD_CHECK_DT_OP(obd, reconnect, 0);
+ OBD_COUNTER_INCREMENT(obd, reconnect);
+
+ rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata);
+ /* check that only subset is granted */
+ LASSERT(ergo(d != NULL,
+ (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+ return rc;
+}
+
+static inline int obd_disconnect(struct obd_export *exp)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, disconnect);
+ EXP_COUNTER_INCREMENT(exp, disconnect);
+
+ rc = OBP(exp->exp_obd, disconnect)(exp);
+ return rc;
+}
+
+static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp,
+ enum lu_cli_type type)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(obd, fid_init, 0);
+ OBD_COUNTER_INCREMENT(obd, fid_init);
+
+ rc = OBP(obd, fid_init)(obd, exp, type);
+ return rc;
+}
+
+static inline int obd_fid_fini(struct obd_device *obd)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(obd, fid_fini, 0);
+ OBD_COUNTER_INCREMENT(obd, fid_fini);
+
+ rc = OBP(obd, fid_fini)(obd);
+ return rc;
+}
+
+static inline int obd_fid_alloc(struct obd_export *exp,
+ struct lu_fid *fid,
+ struct md_op_data *op_data)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, fid_alloc);
+ EXP_COUNTER_INCREMENT(exp, fid_alloc);
+
+ rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data);
+ return rc;
+}
+
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_new);
+
+ rc = OBP(obd, pool_new)(obd, poolname);
+ return rc;
+}
+
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_del);
+
+ rc = OBP(obd, pool_del)(obd, poolname);
+ return rc;
+}
+
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_add);
+
+ rc = OBP(obd, pool_add)(obd, poolname, ostname);
+ return rc;
+}
+
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+ rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+ return rc;
+}
+
+static inline void obd_getref(struct obd_device *obd)
+{
+ if (OBT(obd) && OBP(obd, getref)) {
+ OBD_COUNTER_INCREMENT(obd, getref);
+ OBP(obd, getref)(obd);
+ }
+}
+
+static inline void obd_putref(struct obd_device *obd)
+{
+ if (OBT(obd) && OBP(obd, putref)) {
+ OBD_COUNTER_INCREMENT(obd, putref);
+ OBP(obd, putref)(obd);
+ }
+}
+
+static inline int obd_init_export(struct obd_export *exp)
+{
+ int rc = 0;
+
+ if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) &&
+ OBP((exp)->exp_obd, init_export))
+ rc = OBP(exp->exp_obd, init_export)(exp);
+ return rc;
+}
+
+static inline int obd_destroy_export(struct obd_export *exp)
+{
+ if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) &&
+ OBP((exp)->exp_obd, destroy_export))
+ OBP(exp->exp_obd, destroy_export)(exp);
+ return 0;
+}
+
+/* @max_age is the oldest time in jiffies that we accept using cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs_async(struct obd_export *exp,
+ struct obd_info *oinfo,
+ __u64 max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ int rc = 0;
+ struct obd_device *obd;
+
+ if (exp == NULL || exp->exp_obd == NULL)
+ return -EINVAL;
+
+ obd = exp->exp_obd;
+ OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, statfs);
+
+ CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n",
+ obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age);
+ if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+ rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset);
+ } else {
+ CDEBUG(D_SUPER,
+ "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n",
+ obd->obd_name, &obd->obd_osfs,
+ obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+ obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+ spin_unlock(&obd->obd_osfs_lock);
+ oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+ if (oinfo->oi_cb_up)
+ oinfo->oi_cb_up(oinfo, 0);
+ }
+ return rc;
+}
+
+static inline int obd_statfs_rqset(struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age,
+ __u32 flags)
+{
+ struct ptlrpc_request_set *set = NULL;
+ struct obd_info oinfo = { { { 0 } } };
+ int rc = 0;
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ return -ENOMEM;
+
+ oinfo.oi_osfs = osfs;
+ oinfo.oi_flags = flags;
+ rc = obd_statfs_async(exp, &oinfo, max_age, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ return rc;
+}
+
+/* @max_age is the oldest time in jiffies that we accept using cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age,
+ __u32 flags)
+{
+ int rc = 0;
+ struct obd_device *obd = exp->exp_obd;
+
+ if (obd == NULL)
+ return -EINVAL;
+
+ OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+ OBD_COUNTER_INCREMENT(obd, statfs);
+
+ CDEBUG(D_SUPER, "osfs age %llu, max_age %llu\n",
+ obd->obd_osfs_age, max_age);
+ if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+ rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags);
+ if (rc == 0) {
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs));
+ obd->obd_osfs_age = cfs_time_current_64();
+ spin_unlock(&obd->obd_osfs_lock);
+ }
+ } else {
+ CDEBUG(D_SUPER, "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n",
+ obd->obd_name, &obd->obd_osfs,
+ obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+ obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
+ spin_unlock(&obd->obd_osfs_lock);
+ }
+ return rc;
+}
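+
+/*
+ * Illustrative sketch only (not part of the original header): forcing a
+ * fresh statfs as suggested by the comment above, versus accepting whatever
+ * is already cached in obd_osfs.
+ */
+static inline int example_statfs_fresh(const struct lu_env *env,
+ struct obd_export *exp,
+ struct obd_statfs *osfs)
+{
+ /* A max_age in the future can never be satisfied by the cache,
+ * so the target is always asked. */
+ return obd_statfs(env, exp, osfs, cfs_time_current_64() + HZ, 0);
+}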
+
+static inline int obd_preprw(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa,
+ int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *remote, int *pages,
+ struct niobuf_local *local,
+ struct obd_trans_info *oti,
+ struct lustre_capa *capa)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, preprw);
+ EXP_COUNTER_INCREMENT(exp, preprw);
+
+ rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote,
+ pages, local, oti, capa);
+ return rc;
+}
+
+static inline int obd_commitrw(const struct lu_env *env, int cmd,
+ struct obd_export *exp, struct obdo *oa,
+ int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *rnb, int pages,
+ struct niobuf_local *local,
+ struct obd_trans_info *oti, int rc)
+{
+ EXP_CHECK_DT_OP(exp, commitrw);
+ EXP_COUNTER_INCREMENT(exp, commitrw);
+
+ rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj,
+ rnb, pages, local, oti, rc);
+ return rc;
+}
+
+static inline int obd_adjust_kms(struct obd_export *exp,
+ struct lov_stripe_md *lsm, u64 size,
+ int shrink)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, adjust_kms);
+ EXP_COUNTER_INCREMENT(exp, adjust_kms);
+
+ rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink);
+ return rc;
+}
+
+static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp,
+ int len, void *karg, void *uarg)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, iocontrol);
+ EXP_COUNTER_INCREMENT(exp, iocontrol);
+
+ rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg);
+ return rc;
+}
+
+static inline int obd_find_cbdata(struct obd_export *exp,
+ struct lov_stripe_md *lsm,
+ ldlm_iterator_t it, void *data)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, find_cbdata);
+ EXP_COUNTER_INCREMENT(exp, find_cbdata);
+
+ rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data);
+ return rc;
+}
+
+static inline void obd_import_event(struct obd_device *obd,
+ struct obd_import *imp,
+ enum obd_import_event event)
+{
+ if (!obd) {
+ CERROR("NULL device\n");
+ return;
+ }
+ if (obd->obd_set_up && OBP(obd, import_event)) {
+ OBD_COUNTER_INCREMENT(obd, import_event);
+ OBP(obd, import_event)(obd, imp, event);
+ }
+}
+
+static inline int obd_notify(struct obd_device *obd,
+ struct obd_device *watched,
+ enum obd_notify_event ev,
+ void *data)
+{
+ int rc;
+
+ rc = obd_check_dev(obd);
+ if (rc)
+ return rc;
+
+ /* the check for async_recov is a complete hack - I'm hereby
+ overloading the meaning to also mean "this was called from
+ mds_postsetup". I know that my mds is able to handle notifies
+ by this point, and it needs to get them to execute mds_postrecov. */
+ if (!obd->obd_set_up && !obd->obd_async_recov) {
+ CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
+ return -EINVAL;
+ }
+
+ if (!OBP(obd, notify)) {
+ CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name);
+ return -ENOSYS;
+ }
+
+ OBD_COUNTER_INCREMENT(obd, notify);
+ rc = OBP(obd, notify)(obd, watched, ev, data);
+ return rc;
+}
+
+static inline int obd_notify_observer(struct obd_device *observer,
+ struct obd_device *observed,
+ enum obd_notify_event ev,
+ void *data)
+{
+ int rc1;
+ int rc2;
+
+ struct obd_notify_upcall *onu;
+
+ if (observer->obd_observer)
+ rc1 = obd_notify(observer->obd_observer, observed, ev, data);
+ else
+ rc1 = 0;
+ /*
+ * Also, call non-obd listener, if any
+ */
+ onu = &observer->obd_upcall;
+ if (onu->onu_upcall != NULL)
+ rc2 = onu->onu_upcall(observer, observed, ev,
+ onu->onu_owner, NULL);
+ else
+ rc2 = 0;
+
+ return rc1 ? rc1 : rc2;
+}
+
+static inline int obd_quotacheck(struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, quotacheck);
+ EXP_COUNTER_INCREMENT(exp, quotacheck);
+
+ rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl);
+ return rc;
+}
+
+static inline int obd_quotactl(struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ int rc;
+
+ EXP_CHECK_DT_OP(exp, quotactl);
+ EXP_COUNTER_INCREMENT(exp, quotactl);
+
+ rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl);
+ return rc;
+}
+
+static inline int obd_health_check(const struct lu_env *env,
+ struct obd_device *obd)
+{
+ /* returns: 0 on healthy
+ * >0 on unhealthy + reason code/flag
+ * however the only supported reason == 1 right now
+ * We'll need to define some better reasons
+ * or flags in the future.
+ * <0 on error
+ */
+ int rc;
+
+ /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */
+ if (obd == NULL || !OBT(obd)) {
+ CERROR("cleaned up obd\n");
+ return -EOPNOTSUPP;
+ }
+ if (!obd->obd_set_up || obd->obd_stopping)
+ return 0;
+ if (!OBP(obd, health_check))
+ return 0;
+
+ rc = OBP(obd, health_check)(env, obd);
+ return rc;
+}
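+
+/*
+ * Illustrative sketch only (not part of the original header): interpreting
+ * the tri-state result described above from a hypothetical watchdog.
+ */
+static inline void example_report_health(const struct lu_env *env,
+ struct obd_device *obd)
+{
+ int rc = obd_health_check(env, obd);
+
+ if (rc > 0)
+ CDEBUG(D_INFO, "%s: unhealthy, reason %d\n", obd->obd_name, rc);
+ else if (rc < 0)
+ CDEBUG(D_INFO, "%s: health check error: %d\n", obd->obd_name, rc);
+}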
+
+static inline int obd_register_observer(struct obd_device *obd,
+ struct obd_device *observer)
+{
+ int rc;
+
+ rc = obd_check_dev(obd);
+ if (rc)
+ return rc;
+ down_write(&obd->obd_observer_link_sem);
+ if (obd->obd_observer && observer) {
+ up_write(&obd->obd_observer_link_sem);
+ return -EALREADY;
+ }
+ obd->obd_observer = observer;
+ up_write(&obd->obd_observer_link_sem);
+ return 0;
+}
+
+#if 0
+static inline int obd_register_page_removal_cb(struct obd_export *exp,
+ obd_page_removal_cb_t cb,
+ obd_pin_extent_cb pin_cb)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb);
+
+ rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb);
+ return rc;
+}
+
+static inline int obd_unregister_page_removal_cb(struct obd_export *exp,
+ obd_page_removal_cb_t cb)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb);
+
+ rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb);
+ return rc;
+}
+
+static inline int obd_register_lock_cancel_cb(struct obd_export *exp,
+ obd_lock_cancel_cb cb)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb);
+
+ rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb);
+ return rc;
+}
+
+static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
+ obd_lock_cancel_cb cb)
+{
+ int rc;
+
+ OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb);
+
+ rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
+ return rc;
+}
+#endif
+
+/* metadata helpers */
+static inline int md_getstatus(struct obd_export *exp,
+ struct lu_fid *fid, struct obd_capa **pc)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, getstatus);
+ EXP_MD_COUNTER_INCREMENT(exp, getstatus);
+ rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc);
+ return rc;
+}
+
+static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, getattr);
+ EXP_MD_COUNTER_INCREMENT(exp, getattr);
+ rc = MDP(exp->exp_obd, getattr)(exp, op_data, request);
+ return rc;
+}
+
+static inline int md_null_inode(struct obd_export *exp,
+ const struct lu_fid *fid)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, null_inode);
+ EXP_MD_COUNTER_INCREMENT(exp, null_inode);
+ rc = MDP(exp->exp_obd, null_inode)(exp, fid);
+ return rc;
+}
+
+static inline int md_find_cbdata(struct obd_export *exp,
+ const struct lu_fid *fid,
+ ldlm_iterator_t it, void *data)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, find_cbdata);
+ EXP_MD_COUNTER_INCREMENT(exp, find_cbdata);
+ rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data);
+ return rc;
+}
+
+static inline int md_close(struct obd_export *exp, struct md_op_data *op_data,
+ struct md_open_data *mod,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, close);
+ EXP_MD_COUNTER_INCREMENT(exp, close);
+ rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request);
+ return rc;
+}
+
+static inline int md_create(struct obd_export *exp, struct md_op_data *op_data,
+ const void *data, int datalen, int mode, __u32 uid,
+ __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, create);
+ EXP_MD_COUNTER_INCREMENT(exp, create);
+ rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode,
+ uid, gid, cap_effective, rdev, request);
+ return rc;
+}
+
+static inline int md_done_writing(struct obd_export *exp,
+ struct md_op_data *op_data,
+ struct md_open_data *mod)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, done_writing);
+ EXP_MD_COUNTER_INCREMENT(exp, done_writing);
+ rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod);
+ return rc;
+}
+
+static inline int md_enqueue(struct obd_export *exp,
+ struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it,
+ struct md_op_data *op_data,
+ struct lustre_handle *lockh,
+ void *lmm, int lmmsize,
+ struct ptlrpc_request **req,
+ __u64 extra_lock_flags)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, enqueue);
+ EXP_MD_COUNTER_INCREMENT(exp, enqueue);
+ rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh,
+ lmm, lmmsize, req, extra_lock_flags);
+ return rc;
+}
+
+static inline int md_getattr_name(struct obd_export *exp,
+ struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, getattr_name);
+ EXP_MD_COUNTER_INCREMENT(exp, getattr_name);
+ rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request);
+ return rc;
+}
+
+static inline int md_intent_lock(struct obd_export *exp,
+ struct md_op_data *op_data, void *lmm,
+ int lmmsize, struct lookup_intent *it,
+ int lookup_flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, intent_lock);
+ EXP_MD_COUNTER_INCREMENT(exp, intent_lock);
+ rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize,
+ it, lookup_flags, reqp, cb_blocking,
+ extra_lock_flags);
+ return rc;
+}
+
+static inline int md_link(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, link);
+ EXP_MD_COUNTER_INCREMENT(exp, link);
+ rc = MDP(exp->exp_obd, link)(exp, op_data, request);
+ return rc;
+}
+
+static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new,
+ int newlen, struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, rename);
+ EXP_MD_COUNTER_INCREMENT(exp, rename);
+ rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new,
+ newlen, request);
+ return rc;
+}
+
+static inline int md_is_subdir(struct obd_export *exp,
+ const struct lu_fid *pfid,
+ const struct lu_fid *cfid,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, is_subdir);
+ EXP_MD_COUNTER_INCREMENT(exp, is_subdir);
+ rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request);
+ return rc;
+}
+
+static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len,
+ struct ptlrpc_request **request,
+ struct md_open_data **mod)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, setattr);
+ EXP_MD_COUNTER_INCREMENT(exp, setattr);
+ rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen,
+ ea2, ea2len, request, mod);
+ return rc;
+}
+
+static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, sync);
+ EXP_MD_COUNTER_INCREMENT(exp, sync);
+ rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request);
+ return rc;
+}
+
+static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata,
+ struct page **pages,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, readpage);
+ EXP_MD_COUNTER_INCREMENT(exp, readpage);
+ rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request);
+ return rc;
+}
+
+static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, unlink);
+ EXP_MD_COUNTER_INCREMENT(exp, unlink);
+ rc = MDP(exp->exp_obd, unlink)(exp, op_data, request);
+ return rc;
+}
+
+static inline int md_get_lustre_md(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ struct obd_export *dt_exp,
+ struct obd_export *md_exp,
+ struct lustre_md *md)
+{
+ EXP_CHECK_MD_OP(exp, get_lustre_md);
+ EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md);
+ return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md);
+}
+
+static inline int md_free_lustre_md(struct obd_export *exp,
+ struct lustre_md *md)
+{
+ EXP_CHECK_MD_OP(exp, free_lustre_md);
+ EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md);
+ return MDP(exp->exp_obd, free_lustre_md)(exp, md);
+}
+
+static inline int md_setxattr(struct obd_export *exp,
+ const struct lu_fid *fid, struct obd_capa *oc,
+ u64 valid, const char *name,
+ const char *input, int input_size,
+ int output_size, int flags, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ EXP_CHECK_MD_OP(exp, setxattr);
+ EXP_MD_COUNTER_INCREMENT(exp, setxattr);
+ return MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input,
+ input_size, output_size, flags,
+ suppgid, request);
+}
+
+static inline int md_getxattr(struct obd_export *exp,
+ const struct lu_fid *fid, struct obd_capa *oc,
+ u64 valid, const char *name,
+ const char *input, int input_size,
+ int output_size, int flags,
+ struct ptlrpc_request **request)
+{
+ EXP_CHECK_MD_OP(exp, getxattr);
+ EXP_MD_COUNTER_INCREMENT(exp, getxattr);
+ return MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input,
+ input_size, output_size, flags,
+ request);
+}
+
+static inline int md_set_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och,
+ struct lookup_intent *it)
+{
+ EXP_CHECK_MD_OP(exp, set_open_replay_data);
+ EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+ return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it);
+}
+
+static inline int md_clear_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och)
+{
+ EXP_CHECK_MD_OP(exp, clear_open_replay_data);
+ EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data);
+ return MDP(exp->exp_obd, clear_open_replay_data)(exp, och);
+}
+
+static inline int md_set_lock_data(struct obd_export *exp,
+ __u64 *lockh, void *data, __u64 *bits)
+{
+ EXP_CHECK_MD_OP(exp, set_lock_data);
+ EXP_MD_COUNTER_INCREMENT(exp, set_lock_data);
+ return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits);
+}
+
+static inline int md_cancel_unused(struct obd_export *exp,
+ const struct lu_fid *fid,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags,
+ void *opaque)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, cancel_unused);
+ EXP_MD_COUNTER_INCREMENT(exp, cancel_unused);
+
+ rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode,
+ flags, opaque);
+ return rc;
+}
+
+static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid,
+ ldlm_type_t type,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ struct lustre_handle *lockh)
+{
+ EXP_CHECK_MD_OP(exp, lock_match);
+ EXP_MD_COUNTER_INCREMENT(exp, lock_match);
+ return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type,
+ policy, mode, lockh);
+}
+
+static inline int md_init_ea_size(struct obd_export *exp, int easize,
+ int def_asize, int cookiesize,
+ int def_cookiesize)
+{
+ EXP_CHECK_MD_OP(exp, init_ea_size);
+ EXP_MD_COUNTER_INCREMENT(exp, init_ea_size);
+ return MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize,
+ cookiesize, def_cookiesize);
+}
+
+static inline int md_get_remote_perm(struct obd_export *exp,
+ const struct lu_fid *fid,
+ struct obd_capa *oc, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ EXP_CHECK_MD_OP(exp, get_remote_perm);
+ EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm);
+ return MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid,
+ request);
+}
+
+static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa,
+ renew_capa_cb_t cb)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, renew_capa);
+ EXP_MD_COUNTER_INCREMENT(exp, renew_capa);
+ rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb);
+ return rc;
+}
+
+static inline int md_unpack_capa(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ const struct req_msg_field *field,
+ struct obd_capa **oc)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, unpack_capa);
+ EXP_MD_COUNTER_INCREMENT(exp, unpack_capa);
+ rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc);
+ return rc;
+}
+
+static inline int md_intent_getattr_async(struct obd_export *exp,
+ struct md_enqueue_info *minfo,
+ struct ldlm_enqueue_info *einfo)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, intent_getattr_async);
+ EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async);
+ rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo);
+ return rc;
+}
+
+static inline int md_revalidate_lock(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct lu_fid *fid, __u64 *bits)
+{
+ int rc;
+
+ EXP_CHECK_MD_OP(exp, revalidate_lock);
+ EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock);
+ rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits);
+ return rc;
+}
+
+
+/* OBD Metadata Support */
+
+extern int obd_init_caches(void);
+extern void obd_cleanup_caches(void);
+
+/* support routines */
+extern struct kmem_cache *obdo_cachep;
+
+#define OBDO_ALLOC(ptr) \
+do { \
+ OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, GFP_NOFS); \
+} while (0)
+
+#define OBDO_FREE(ptr) \
+do { \
+ OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \
+} while (0)
+
+
+static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid)
+{
+ /* something here */
+}
+
+static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa)
+{
+ /* something here */
+}
+
+typedef int (*register_lwp_cb)(void *data);
+
+struct lwp_register_item {
+ struct obd_export **lri_exp;
+ register_lwp_cb lri_cb_func;
+ void *lri_cb_data;
+ struct list_head lri_list;
+ char lri_name[MTI_NAME_MAXLEN];
+};
+
+/* I'm as embarrassed about this as you are.
+ *
+ * <shaver> // XXX do not look into _superhack with remaining eye
+ * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
+extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+
+/* obd_mount.c */
+
+/* sysctl.c */
+extern void obd_sysctl_init(void);
+extern void obd_sysctl_clean(void);
+
+/* uuid.c */
+typedef __u8 class_uuid_t[16];
+void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index);
+int class_add_uuid(const char *uuid, __u64 nid);
+int class_del_uuid(const char *uuid);
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
+/* class_obd.c */
+extern char obd_jobid_node[];
+extern struct miscdevice obd_psdev;
+extern spinlock_t obd_types_lock;
+
+/* prng.c */
+#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t))
+
+#endif /* __LINUX_OBD_CLASS_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
new file mode 100644
index 000000000..2991d2ee7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/include/obd_support.h
@@ -0,0 +1,862 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_SUPPORT
+#define _OBD_SUPPORT
+
+#include <linux/slab.h>
+#include "../../include/linux/libcfs/libcfs.h"
+#include "linux/lustre_compat25.h"
+#include "lprocfs_status.h"
+
+/* global variables */
+extern struct lprocfs_stats *obd_memory;
+enum {
+ OBD_MEMORY_STAT = 0,
+ OBD_MEMORY_PAGES_STAT = 1,
+ OBD_STATS_NUM,
+};
+
+extern unsigned int obd_debug_peer_on_timeout;
+extern unsigned int obd_dump_on_timeout;
+extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+ networking / disk / timings affected by load (use Adaptive Timeouts) */
+extern unsigned int obd_timeout; /* seconds */
+extern unsigned int ldlm_timeout; /* seconds */
+extern unsigned int obd_timeout_set;
+extern unsigned int ldlm_timeout_set;
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
+extern unsigned int obd_sync_filter;
+extern unsigned int obd_max_dirty_pages;
+extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
+extern unsigned int obd_alloc_fail_rate;
+extern char obd_jobid_var[];
+
+/* lvfs.c */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+ size_t size, const char *file, int line);
+
+/* Some hash init argument constants */
+#define HASH_POOLS_BKT_BITS 3
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_BKT_BITS 5
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_BKT_BITS 5
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_BKT_BITS 5
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQE_BKT_BITS 5
+#define HASH_LQE_CUR_BITS 7
+#define HASH_LQE_MAX_BITS 12
+#define HASH_CONN_BKT_BITS 5
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+#define HASH_EXP_LOCK_BKT_BITS 5
+#define HASH_EXP_LOCK_CUR_BITS 7
+#define HASH_EXP_LOCK_MAX_BITS 16
+#define HASH_CL_ENV_BKT_BITS 5
+#define HASH_CL_ENV_BITS 10
+#define HASH_JOB_STATS_BKT_BITS 5
+#define HASH_JOB_STATS_CUR_BITS 7
+#define HASH_JOB_STATS_MAX_BITS 12
+
+/* Timeout definitions */
+#define OBD_TIMEOUT_DEFAULT 100
+#define LDLM_TIMEOUT_DEFAULT 20
+#define MDS_LDLM_TIMEOUT_DEFAULT 6
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
+/* Client may skip 1 ping; we must wait at least 2.5 intervals. But for multiple
+ * failover targets the client only pings one server at a time, and pings
+ * can be lost on a loaded network. Since eviction has serious consequences,
+ * and there's no urgent need to evict a client just because it's idle, we
+ * should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+/* Max connect interval for nonresponsive servers; ~50s to avoid building up
+ connect requests in the LND queues, but within obd_timeout so we don't
+ miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout))
+#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */
+/* In general this should be low to have quick detection of a system
+ running on a backup server. (If it's too low, import_select_connection
+ will increase the timeout anyhow.) */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20)
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+ INITIAL_CONNECT_TIMEOUT)
+/* The min time a target should wait for clients to reconnect in recovery */
+#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX)
+#define OBD_IR_FACTOR_MIN 1
+#define OBD_IR_FACTOR_MAX 10
+#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2)
+/* default timeout for the MGS to become IR_FULL */
+#define OBD_IR_MGS_TIMEOUT (4*obd_timeout)
+#define LONG_UNLINK 300 /* Unlink should happen before now */
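+
+/* Worked example (illustrative only), assuming the default obd_timeout
+ * of 100 seconds from OBD_TIMEOUT_DEFAULT above:
+ *   PING_INTERVAL           = max(100 / 4, 1)      = 25s
+ *   PING_EVICT_TIMEOUT      = 25 * 6               = 150s
+ *   CONNECTION_SWITCH_MAX   = min(50, max(5, 100)) = 50s
+ *   INITIAL_CONNECT_TIMEOUT = max(5, 100 / 20)     = 5s
+ *   RECONNECT_DELAY_MAX     = 50 + 5 + 5           = 60s
+ *   OBD_RECOVERY_TIME_MIN   = 2 * 60               = 120s
+ */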
+
+/**
+ * Time interval of grant shrinking: if the client is "idle" for more than
+ * this interval, the ll_grant thread will return the requested grant space
+ * to the filter
+ */
+#define GRANT_SHRINK_INTERVAL 1200 /* 20 minutes */
+
+#define OBD_FAIL_MDS 0x100
+#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101
+#define OBD_FAIL_MDS_GETATTR_NET 0x102
+#define OBD_FAIL_MDS_GETATTR_PACK 0x103
+#define OBD_FAIL_MDS_READPAGE_NET 0x104
+#define OBD_FAIL_MDS_READPAGE_PACK 0x105
+#define OBD_FAIL_MDS_SENDPAGE 0x106
+#define OBD_FAIL_MDS_REINT_NET 0x107
+#define OBD_FAIL_MDS_REINT_UNPACK 0x108
+#define OBD_FAIL_MDS_REINT_SETATTR 0x109
+#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a
+#define OBD_FAIL_MDS_REINT_CREATE 0x10b
+#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c
+#define OBD_FAIL_MDS_REINT_UNLINK 0x10d
+#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e
+#define OBD_FAIL_MDS_REINT_LINK 0x10f
+#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110
+#define OBD_FAIL_MDS_REINT_RENAME 0x111
+#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112
+#define OBD_FAIL_MDS_OPEN_NET 0x113
+#define OBD_FAIL_MDS_OPEN_PACK 0x114
+#define OBD_FAIL_MDS_CLOSE_NET 0x115
+#define OBD_FAIL_MDS_CLOSE_PACK 0x116
+#define OBD_FAIL_MDS_CONNECT_NET 0x117
+#define OBD_FAIL_MDS_CONNECT_PACK 0x118
+#define OBD_FAIL_MDS_REINT_NET_REP 0x119
+#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a
+#define OBD_FAIL_MDS_GETSTATUS_NET 0x11b
+#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c
+#define OBD_FAIL_MDS_STATFS_PACK 0x11d
+#define OBD_FAIL_MDS_STATFS_NET 0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f
+#define OBD_FAIL_MDS_PIN_NET 0x120
+#define OBD_FAIL_MDS_UNPIN_NET 0x121
+#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122
+#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123
+#define OBD_FAIL_MDS_SYNC_NET 0x124
+#define OBD_FAIL_MDS_SYNC_PACK 0x125
+#define OBD_FAIL_MDS_DONE_WRITING_NET 0x126
+#define OBD_FAIL_MDS_DONE_WRITING_PACK 0x127
+#define OBD_FAIL_MDS_ALLOC_OBDO 0x128
+#define OBD_FAIL_MDS_PAUSE_OPEN 0x129
+#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a
+#define OBD_FAIL_MDS_OPEN_CREATE 0x12b
+#define OBD_FAIL_MDS_OST_SETATTR 0x12c
+#define OBD_FAIL_MDS_QUOTACHECK_NET 0x12d
+#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e
+#define OBD_FAIL_MDS_CLIENT_ADD 0x12f
+#define OBD_FAIL_MDS_GETXATTR_NET 0x130
+#define OBD_FAIL_MDS_GETXATTR_PACK 0x131
+#define OBD_FAIL_MDS_SETXATTR_NET 0x132
+#define OBD_FAIL_MDS_SETXATTR 0x133
+#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134
+#define OBD_FAIL_MDS_FS_SETUP 0x135
+#define OBD_FAIL_MDS_RESEND 0x136
+#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138
+#define OBD_FAIL_MDS_OSC_PRECREATE 0x139
+#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
+#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
+#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c
+#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140
+#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141
+#define OBD_FAIL_MDS_REINT_DELAY 0x142
+#define OBD_FAIL_MDS_READLINK_EPROTO 0x143
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144
+#define OBD_FAIL_MDS_PDO_LOCK 0x145
+#define OBD_FAIL_MDS_PDO_LOCK2 0x146
+#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147
+#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148
+#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149
+#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a
+#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b
+#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d
+#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e
+#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f
+#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150
+#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151
+
+/* layout lock */
+#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170
+#define OBD_FAIL_MDS_NO_LL_OPEN 0x171
+#define OBD_FAIL_MDS_LL_BLOCK 0x172
+
+/* CMD */
+#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180
+#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181
+#define OBD_FAIL_MDS_SET_INFO_NET 0x182
+#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183
+#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184
+#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET 0x186
+#define OBD_FAIL_MDS_DQACQ_NET 0x187
+
+/* OI scrub */
+#define OBD_FAIL_OSD_SCRUB_DELAY 0x190
+#define OBD_FAIL_OSD_SCRUB_CRASH 0x191
+#define OBD_FAIL_OSD_SCRUB_FATAL 0x192
+#define OBD_FAIL_OSD_FID_MAPPING 0x193
+#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194
+#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195
+
+#define OBD_FAIL_OST 0x200
+#define OBD_FAIL_OST_CONNECT_NET 0x201
+#define OBD_FAIL_OST_DISCONNECT_NET 0x202
+#define OBD_FAIL_OST_GET_INFO_NET 0x203
+#define OBD_FAIL_OST_CREATE_NET 0x204
+#define OBD_FAIL_OST_DESTROY_NET 0x205
+#define OBD_FAIL_OST_GETATTR_NET 0x206
+#define OBD_FAIL_OST_SETATTR_NET 0x207
+#define OBD_FAIL_OST_OPEN_NET 0x208
+#define OBD_FAIL_OST_CLOSE_NET 0x209
+#define OBD_FAIL_OST_BRW_NET 0x20a
+#define OBD_FAIL_OST_PUNCH_NET 0x20b
+#define OBD_FAIL_OST_STATFS_NET 0x20c
+#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK 0x20f
+#define OBD_FAIL_OST_SYNC_NET 0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET 0x211
+#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
+#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214
+#define OBD_FAIL_OST_ENOSPC 0x215
+#define OBD_FAIL_OST_EROFS 0x216
+#define OBD_FAIL_OST_ENOENT 0x217
+#define OBD_FAIL_OST_QUOTACHECK_NET 0x218
+#define OBD_FAIL_OST_QUOTACTL_NET 0x219
+#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a
+#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b
+#define OBD_FAIL_OST_BRW_SIZE 0x21c
+#define OBD_FAIL_OST_DROP_REQ 0x21d
+#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e
+#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f
+#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
+#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE 0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
+#define OBD_FAIL_OST_CONNECT_NET2 0x225
+#define OBD_FAIL_OST_NOMEM 0x226
+#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227
+#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228
+#define OBD_FAIL_OST_ENOINO 0x229
+#define OBD_FAIL_OST_DQACQ_NET 0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231
+
+#define OBD_FAIL_LDLM 0x300
+#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
+#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
+#define OBD_FAIL_LDLM_CONVERT_NET 0x303
+#define OBD_FAIL_LDLM_CANCEL_NET 0x304
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
+#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306
+#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307
+#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
+#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309
+#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a
+#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
+#define OBD_FAIL_LDLM_REPLY 0x30c
+#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+#define OBD_FAIL_LDLM_GLIMPSE 0x30f
+#define OBD_FAIL_LDLM_CANCEL_RACE 0x310
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST 0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE 0x318
+#define OBD_FAIL_LDLM_NEW_LOCK 0x319
+#define OBD_FAIL_LDLM_AGL_DELAY 0x31a
+#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b
+#define OBD_FAIL_LDLM_OST_LVB 0x31c
+
+/* LOCKLESS IO */
+#define OBD_FAIL_LDLM_SET_CONTENTION 0x385
+
+#define OBD_FAIL_OSC 0x400
+#define OBD_FAIL_OSC_BRW_READ_BULK 0x401
+#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402
+#define OBD_FAIL_OSC_LOCK_BL_AST 0x403
+#define OBD_FAIL_OSC_LOCK_CP_AST 0x404
+#define OBD_FAIL_OSC_MATCH 0x405
+#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406
+#define OBD_FAIL_OSC_SHUTDOWN 0x407
+#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408
+#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409
+#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a
+#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b
+#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE 0x40d
+#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e
+#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f
+#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410
+#define OBD_FAIL_OSC_NO_GRANT 0x411
+#define OBD_FAIL_OSC_DELAY_SETTIME 0x412
+
+#define OBD_FAIL_PTLRPC 0x500
+#define OBD_FAIL_PTLRPC_ACK 0x501
+#define OBD_FAIL_PTLRPC_RQBD 0x502
+#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503
+#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC 0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506
+#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d
+#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512
+#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513
+#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515
+#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517
+
+#define OBD_FAIL_OBD_PING_NET 0x600
+#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
+#define OBD_FAIL_OBD_LOGD_NET 0x602
+#define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603
+#define OBD_FAIL_OBD_DQACQ 0x604
+#define OBD_FAIL_OBD_LLOG_SETUP 0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606
+#define OBD_FAIL_OBD_IDX_READ_NET 0x607
+#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608
+#define OBD_FAIL_OBD_NO_LRU 0x609
+
+#define OBD_FAIL_TGT_REPLY_NET 0x700
+#define OBD_FAIL_TGT_CONN_RACE 0x701
+#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702
+#define OBD_FAIL_TGT_DELAY_CONNECT 0x703
+#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
+#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705
+#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706
+#define OBD_FAIL_TGT_REPLAY_DROP 0x707
+#define OBD_FAIL_TGT_FAKE_EXP 0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY 0x709
+#define OBD_FAIL_TGT_LAST_REPLAY 0x710
+#define OBD_FAIL_TGT_CLIENT_ADD 0x711
+#define OBD_FAIL_TGT_RCVG_FLAG 0x712
+#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713
+
+#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800
+#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803
+#define OBD_FAIL_MDC_RPCS_SEM 0x804
+#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805
+
+#define OBD_FAIL_MGS 0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903
+#define OBD_FAIL_MGS_PAUSE_REQ 0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905
+#define OBD_FAIL_MGS_CONNECT_NET 0x906
+#define OBD_FAIL_MGS_DISCONNECT_NET 0x907
+#define OBD_FAIL_MGS_SET_INFO_NET 0x908
+#define OBD_FAIL_MGS_EXCEPTION_NET 0x909
+#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a
+#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b
+#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c
+
+#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01
+#define OBD_FAIL_QUOTA_EDQUOT 0xA02
+#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03
+#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04
+
+#define OBD_FAIL_LPROC_REMOVE 0xB00
+
+#define OBD_FAIL_GENERAL_ALLOC 0xC00
+
+#define OBD_FAIL_SEQ 0x1000
+#define OBD_FAIL_SEQ_QUERY_NET 0x1001
+#define OBD_FAIL_SEQ_EXHAUST 0x1002
+
+#define OBD_FAIL_FLD 0x1100
+#define OBD_FAIL_FLD_QUERY_NET 0x1101
+
+#define OBD_FAIL_SEC_CTX 0x1200
+#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201
+#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202
+#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204
+
+#define OBD_FAIL_LLOG 0x1300
+#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308
+#define OBD_FAIL_LLOG_CATINFO_NET 0x1309
+#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310
+#define OBD_FAIL_SEQ_ALLOC 0x1311
+
+#define OBD_FAIL_LLITE 0x1400
+#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401
+#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402
+#define OBD_FAIL_LOV_INIT 0x1403
+#define OBD_FAIL_GLIMPSE_DELAY 0x1404
+#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405
+
+#define OBD_FAIL_FID_INDIR 0x1501
+#define OBD_FAIL_FID_INLMA 0x1502
+#define OBD_FAIL_FID_IGIF 0x1504
+#define OBD_FAIL_FID_LOOKUP 0x1505
+#define OBD_FAIL_FID_NOLMA 0x1506
+
+/* LFSCK */
+#define OBD_FAIL_LFSCK_DELAY1 0x1600
+#define OBD_FAIL_LFSCK_DELAY2 0x1601
+#define OBD_FAIL_LFSCK_DELAY3 0x1602
+#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603
+#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604
+#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605
+#define OBD_FAIL_LFSCK_FATAL1 0x1608
+#define OBD_FAIL_LFSCK_FATAL2 0x1609
+#define OBD_FAIL_LFSCK_CRASH 0x160a
+#define OBD_FAIL_LFSCK_NO_AUTO 0x160b
+#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c
+
+/* UPDATE */
+#define OBD_FAIL_UPDATE_OBJ_NET 0x1700
+#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701
+
+
+/* Assign references to moved code to reduce code changes */
+#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id)
+#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id)
+#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value)
+#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value)
+#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value)
+#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret)
+#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs)
+#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms)
+#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs)
+#define OBD_RACE(id) CFS_RACE(id)
+#define OBD_FAIL_ONCE CFS_FAIL_ONCE
+#define OBD_FAILED CFS_FAILED
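+
+/*
+ * Illustrative usage sketch, not part of the original header: fail-injection
+ * points are guarded by the wrappers above and fire only when the matching
+ * OBD_FAIL_* id has been armed through the cfs fail_loc mechanism.  The call
+ * sites below are hypothetical:
+ *
+ *   if (OBD_FAIL_CHECK(OBD_FAIL_MDS_ALL_REQUEST_NET))
+ *           return 0;          (pretend the request was lost on the wire)
+ *
+ *   OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PAUSE_OPEN, 10);    (stall when armed)
+ */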
+
+extern atomic_t libcfs_kmemory;
+
+extern void obd_update_maxusage(void);
+
+#if defined(CONFIG_PROC_FS)
+#define obd_memory_add(size) \
+ lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sub(size) \
+ lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sum() \
+ lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \
+ LPROCFS_FIELDS_FLAGS_SUM)
+#define obd_pages_add(order) \
+ lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT, \
+ (long)(1 << (order)))
+#define obd_pages_sub(order) \
+ lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT, \
+ (long)(1 << (order)))
+#define obd_pages_sum() \
+ lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT, \
+ LPROCFS_FIELDS_FLAGS_SUM)
+
+extern __u64 obd_memory_max(void);
+extern __u64 obd_pages_max(void);
+
+#else
+
+extern __u64 obd_alloc;
+extern __u64 obd_pages;
+
+extern __u64 obd_max_alloc;
+extern __u64 obd_max_pages;
+
+static inline void obd_memory_add(long size)
+{
+ obd_alloc += size;
+ if (obd_alloc > obd_max_alloc)
+ obd_max_alloc = obd_alloc;
+}
+
+static inline void obd_memory_sub(long size)
+{
+ obd_alloc -= size;
+}
+
+static inline void obd_pages_add(int order)
+{
+ obd_pages += 1 << order;
+ if (obd_pages > obd_max_pages)
+ obd_max_pages = obd_pages;
+}
+
+static inline void obd_pages_sub(int order)
+{
+ obd_pages -= 1 << order;
+}
+
+#define obd_memory_sum() (obd_alloc)
+#define obd_pages_sum() (obd_pages)
+
+#define obd_memory_max() (obd_max_alloc)
+#define obd_pages_max() (obd_max_pages)
+
+#endif
+
+#define OBD_DEBUG_MEMUSAGE (1)
+
+#if OBD_DEBUG_MEMUSAGE
+#define OBD_ALLOC_POST(ptr, size, name) \
+ obd_memory_add(size); \
+ CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \
+ (int)(size), ptr)
+
+#define OBD_FREE_PRE(ptr, size, name) \
+ LASSERT(ptr); \
+ obd_memory_sub(size); \
+ CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \
+ (int)(size), ptr); \
+ POISON(ptr, 0x5a, size)
+
+#else /* !OBD_DEBUG_MEMUSAGE */
+
+#define OBD_ALLOC_POST(ptr, size, name) ((void)0)
+#define OBD_FREE_PRE(ptr, size, name) ((void)0)
+
+#endif /* !OBD_DEBUG_MEMUSAGE */
+
+#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC)
+
+#define OBD_ALLOC_FAIL_BITS 24
+#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
+#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
+
+#if defined(LUSTRE_UTILS) /* this version is for utils only */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \
+do { \
+ (ptr) = (cptab) == NULL ? \
+ kmalloc(size, flags) : \
+ kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt)); \
+ if (unlikely((ptr) == NULL)) { \
+ CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n", \
+ (int)(size), __FILE__, __LINE__); \
+ } else { \
+ memset(ptr, 0, size); \
+ CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n", \
+ (int)(size), ptr); \
+ } \
+} while (0)
+
+#else /* this version is for the kernel and liblustre */
+#define OBD_FREE_RTN0(ptr) \
+({ \
+ kfree(ptr); \
+ (ptr) = NULL; \
+ 0; \
+})
+
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \
+do { \
+ (ptr) = (cptab) == NULL ? \
+ kmalloc(size, flags | __GFP_ZERO) : \
+ kmalloc_node(size, flags | __GFP_ZERO, \
+ cfs_cpt_spread_node(cptab, cpt)); \
+ if (likely((ptr) != NULL && \
+ (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \
+ !obd_alloc_fail(ptr, #ptr, "km", size, \
+ __FILE__, __LINE__) || \
+ OBD_FREE_RTN0(ptr)))) { \
+ OBD_ALLOC_POST(ptr, size, "kmalloced"); \
+ } \
+} while (0)
+#endif
+
+#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \
+ __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask)
+
+#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_NOFS)
+#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_KERNEL)
+#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof(*(ptr)))
+#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof(*(ptr)))
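+
+/*
+ * Minimal usage sketch (illustration only, not from the original source):
+ * memory obtained through these wrappers is zero-filled, accounted in the
+ * obd memory statistics, and must be released with the matching OBD_FREE*
+ * macro using the same size.  "struct foo" below is hypothetical:
+ *
+ *   struct foo *fp;
+ *
+ *   OBD_ALLOC_PTR(fp);
+ *   if (fp == NULL)
+ *           return -ENOMEM;
+ *   ...
+ *   OBD_FREE_PTR(fp);
+ */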
+
+#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \
+ __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask)
+
+#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \
+ OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, GFP_NOFS)
+
+#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \
+ OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof(*(ptr)))
+
+# define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size) \
+do { \
+ (ptr) = cptab == NULL ? \
+ vzalloc(size) : \
+ vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \
+ if (unlikely((ptr) == NULL)) { \
+ CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \
+ (int)(size)); \
+ CERROR("%llu total bytes allocated by Lustre, %d by LNET\n", \
+ obd_memory_sum(), atomic_read(&libcfs_kmemory)); \
+ } else { \
+ OBD_ALLOC_POST(ptr, size, "vmalloced"); \
+ } \
+} while (0)
+
+# define OBD_VMALLOC(ptr, size) \
+ __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size)
+# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \
+ __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size)
+
+
+/* Allocations above this size are considered too big and cannot be done
+ * atomically.
+ *
+ * Be very careful when changing this value, especially when decreasing it:
+ * vmalloc in Linux doesn't perform well on multi-core systems, so calling
+ * vmalloc on a critical path can hurt performance badly. See LU-66.
+ */
+#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE)
+
+#define OBD_ALLOC_LARGE(ptr, size) \
+do { \
+ if (size > OBD_ALLOC_BIG) \
+ OBD_VMALLOC(ptr, size); \
+ else \
+ OBD_ALLOC(ptr, size); \
+} while (0)
+
+#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \
+do { \
+ if (size > OBD_ALLOC_BIG) \
+ OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \
+ else \
+ OBD_CPT_ALLOC(ptr, cptab, cpt, size); \
+} while (0)
+
+#define OBD_FREE_LARGE(ptr, size) \
+do { \
+ if (size > OBD_ALLOC_BIG) \
+ OBD_VFREE(ptr, size); \
+ else \
+ OBD_FREE(ptr, size); \
+} while (0)
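+
+/*
+ * Illustrative sketch (not part of the original source): buffers that may
+ * exceed OBD_ALLOC_BIG are requested through the *_LARGE pair so they fall
+ * back to vmalloc transparently; the size passed to OBD_FREE_LARGE() must
+ * match the allocation.  "reply_size" below is a hypothetical variable:
+ *
+ *   void *buf;
+ *
+ *   OBD_ALLOC_LARGE(buf, reply_size);
+ *   if (buf == NULL)
+ *           return -ENOMEM;
+ *   ...
+ *   OBD_FREE_LARGE(buf, reply_size);
+ */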
+
+
+#ifdef CONFIG_DEBUG_SLAB
+#define POISON(ptr, c, s) do {} while (0)
+#define POISON_PTR(ptr) ((void)0)
+#else
+#define POISON(ptr, c, s) memset(ptr, c, s)
+#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef
+#endif
+
+#ifdef POISON_BULK
+#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE); \
+ kunmap(page); } while (0)
+#else
+#define POISON_PAGE(page, val) do { } while (0)
+#endif
+
+#define OBD_FREE(ptr, size) \
+do { \
+ OBD_FREE_PRE(ptr, size, "kfreed"); \
+ kfree(ptr); \
+ POISON_PTR(ptr); \
+} while (0)
+
+
+#define OBD_FREE_RCU(ptr, size, handle) \
+do { \
+ struct portals_handle *__h = (handle); \
+ \
+ LASSERT(handle != NULL); \
+ __h->h_cookie = (unsigned long)(ptr); \
+ __h->h_size = (size); \
+ call_rcu(&__h->h_rcu, class_handle_free_cb); \
+ POISON_PTR(ptr); \
+} while (0)
+
+
+#define OBD_VFREE(ptr, size) \
+ do { \
+ OBD_FREE_PRE(ptr, size, "vfreed"); \
+ vfree(ptr); \
+ POISON_PTR(ptr); \
+ } while (0)
+
+/* We memset() the slab object to 0 when allocation succeeds, so DO NOT
+ * HAVE A CTOR THAT DOES ANYTHING: its work will be cleared here. We'd
+ * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */
+#define OBD_SLAB_FREE_RTN0(ptr, slab) \
+({ \
+ kmem_cache_free((slab), (ptr)); \
+ (ptr) = NULL; \
+ 0; \
+})
+
+#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \
+do { \
+ LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \
+ (ptr) = (cptab) == NULL ? \
+ kmem_cache_alloc(slab, type | __GFP_ZERO) : \
+ kmem_cache_alloc_node(slab, type | __GFP_ZERO, \
+ cfs_cpt_spread_node(cptab, cpt)); \
+ if (likely((ptr) != NULL && \
+ (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \
+ !obd_alloc_fail(ptr, #ptr, "slab-", size, \
+ __FILE__, __LINE__) || \
+ OBD_SLAB_FREE_RTN0(ptr, slab)))) { \
+ OBD_ALLOC_POST(ptr, size, "slab-alloced"); \
+ } \
+} while (0)
+
+#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \
+ __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags)
+#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \
+ __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags)
+
+#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof(*(ptr)))
+
+#define OBD_SLAB_FREE(ptr, slab, size) \
+do { \
+ OBD_FREE_PRE(ptr, size, "slab-freed"); \
+ kmem_cache_free(slab, ptr); \
+ POISON_PTR(ptr); \
+} while (0)
+
+#define OBD_SLAB_ALLOC(ptr, slab, size) \
+ OBD_SLAB_ALLOC_GFP(ptr, slab, size, GFP_NOFS)
+
+#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \
+ OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, GFP_NOFS)
+
+#define OBD_SLAB_ALLOC_PTR(ptr, slab) \
+ OBD_SLAB_ALLOC(ptr, slab, sizeof(*(ptr)))
+
+#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \
+ OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof(*(ptr)))
+
+#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \
+ OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof(*(ptr)), flags)
+
+#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \
+ OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof(*(ptr)), flags)
+
+#define OBD_SLAB_FREE_PTR(ptr, slab) \
+ OBD_SLAB_FREE((ptr), (slab), sizeof(*(ptr)))
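+
+/*
+ * Slab usage sketch (illustration only): objects are carved zero-filled from
+ * a kmem_cache created elsewhere and must be returned to the same cache.
+ * "bar_kmem" and "struct bar" are hypothetical names:
+ *
+ *   struct bar *obj;
+ *
+ *   OBD_SLAB_ALLOC_PTR_GFP(obj, bar_kmem, GFP_NOFS);
+ *   if (obj == NULL)
+ *           return -ENOMEM;
+ *   ...
+ *   OBD_SLAB_FREE_PTR(obj, bar_kmem);
+ */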
+
+#define KEY_IS(str) \
+ (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0)
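+
+/*
+ * KEY_IS() assumes local variables named "key" and "keylen" in the caller,
+ * as found in the obd get_info/set_info handlers.  A hedged sketch with a
+ * hypothetical key string:
+ *
+ *   if (KEY_IS("some_key")) {
+ *           ... handle this key ...
+ *   }
+ */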
+
+/* Wrapper for contiguous page frame allocation */
+#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) \
+do { \
+ (ptr) = (cptab) == NULL ? \
+ alloc_page(gfp_mask) : \
+ alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\
+ if (unlikely((ptr) == NULL)) { \
+ CERROR("alloc_pages of '" #ptr "' %d page(s) / %llu bytes "\
+ "failed\n", (int)1, \
+ (__u64)(1 << PAGE_CACHE_SHIFT)); \
+ CERROR("%llu total bytes and %llu total pages " \
+ "(%llu bytes) allocated by Lustre, " \
+ "%d total bytes by LNET\n", \
+ obd_memory_sum(), \
+ obd_pages_sum() << PAGE_CACHE_SHIFT, \
+ obd_pages_sum(), \
+ atomic_read(&libcfs_kmemory)); \
+ } else { \
+ obd_pages_add(0); \
+ CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / " \
+ "%llu bytes at %p.\n", \
+ (int)1, \
+ (__u64)(1 << PAGE_CACHE_SHIFT), ptr); \
+ } \
+} while (0)
+
+#define OBD_PAGE_ALLOC(ptr, gfp_mask) \
+ __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask)
+#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask) \
+ __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)
+
+#define OBD_PAGE_FREE(ptr) \
+do { \
+ LASSERT(ptr); \
+ obd_pages_sub(0); \
+ CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / %llu bytes " \
+ "at %p.\n", \
+ (int)1, (__u64)(1 << PAGE_CACHE_SHIFT), \
+ ptr); \
+ __free_page(ptr); \
+ (ptr) = (void *)0xdeadbeef; \
+} while (0)
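+
+/*
+ * Page wrapper sketch (illustration only): single page frames allocated here
+ * are tracked in the obd_pages statistics and must be released with
+ * OBD_PAGE_FREE():
+ *
+ *   struct page *pg;
+ *
+ *   OBD_PAGE_ALLOC(pg, GFP_NOFS);
+ *   if (pg == NULL)
+ *           return -ENOMEM;
+ *   ...
+ *   OBD_PAGE_FREE(pg);
+ */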
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lclient/glimpse.c b/drivers/staging/lustre/lustre/lclient/glimpse.c
new file mode 100644
index 000000000..b9f2bb66d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/glimpse.c
@@ -0,0 +1,269 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * glimpse code shared between vvp and liblustre (and other Lustre clients in
+ * the future).
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Oleg Drokin <oleg.drokin@sun.com>
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/obd.h"
+
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_mdc.h"
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include "../include/cl_object.h"
+#include "../include/lclient.h"
+#include "../llite/llite_internal.h"
+
+static const struct cl_lock_descr whole_file = {
+ .cld_start = 0,
+ .cld_end = CL_PAGE_EOF,
+ .cld_mode = CLM_READ
+};
+
+/*
+ * Check whether the file has possibly unwritten (dirty) pages.
+ *
+ * \retval 1 file is mmap-ed or has dirty pages
+ * \retval 0 otherwise
+ */
+blkcnt_t dirty_cnt(struct inode *inode)
+{
+ blkcnt_t cnt = 0;
+ struct ccc_object *vob = cl_inode2ccc(inode);
+ void *results[1];
+
+ if (inode->i_mapping != NULL)
+ cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
+ results, 0, 1,
+ PAGECACHE_TAG_DIRTY);
+ if (cnt == 0 && atomic_read(&vob->cob_mmap_cnt) > 0)
+ cnt = 1;
+
+ return (cnt > 0) ? 1 : 0;
+}
+
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+ struct inode *inode, struct cl_object *clob, int agl)
+{
+ struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr;
+ struct cl_inode_info *lli = cl_i2info(inode);
+ const struct lu_fid *fid = lu_object_fid(&clob->co_lu);
+ struct ccc_io *cio = ccc_env_io(env);
+ struct cl_lock *lock;
+ int result;
+
+ result = 0;
+ if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) {
+ CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid));
+ if (lli->lli_has_smd) {
+ /* NOTE: this looks like a DLM lock request, but it may
+ * not be one. Due to the CEF_ASYNC flag (translated
+ * to LDLM_FL_HAS_INTENT by osc), this is a
+ * glimpse request that won't revoke any
+ * conflicting DLM locks held. Instead,
+ * ll_glimpse_callback() will be called on each
+ * client holding a DLM lock against this file,
+ * and the resulting size will be returned for each
+ * stripe. A DLM lock on [0, EOF] is acquired only
+ * if there were no conflicting locks. If there
+ * were conflicting locks, enqueuing or waiting
+ * fails with -ENAVAIL, but valid inode
+ * attributes are returned anyway. */
+ *descr = whole_file;
+ descr->cld_obj = clob;
+ descr->cld_mode = CLM_PHANTOM;
+ descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
+ if (agl)
+ descr->cld_enq_flags |= CEF_AGL;
+ cio->cui_glimpse = 1;
+ /*
+ * CEF_ASYNC is used because glimpse sub-locks cannot
+ * deadlock (because they never conflict with other
+ * locks) and, hence, can be enqueued out-of-order.
+ *
+ * CEF_MUST protects glimpse lock from conversion into
+ * a lockless mode.
+ */
+ lock = cl_lock_request(env, io, descr, "glimpse",
+ current);
+ cio->cui_glimpse = 0;
+
+ if (lock == NULL)
+ return 0;
+
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ LASSERT(agl == 0);
+ result = cl_wait(env, lock);
+ if (result == 0) {
+ cl_merge_lvb(env, inode);
+ if (cl_isize_read(inode) > 0 &&
+ inode->i_blocks == 0) {
+ /*
+ * LU-417: Add dirty pages block count
+ * lest i_blocks reports 0, some "cp" or
+ * "tar" may think it's a completely
+ * sparse file and skip it.
+ */
+ inode->i_blocks = dirty_cnt(inode);
+ }
+ cl_unuse(env, lock);
+ }
+ cl_lock_release(env, lock, "glimpse", current);
+ } else {
+ CDEBUG(D_DLMTRACE, "No objects for inode\n");
+ cl_merge_lvb(env, inode);
+ }
+ }
+
+ return result;
+}
+
+static int cl_io_get(struct inode *inode, struct lu_env **envout,
+ struct cl_io **ioout, int *refcheck)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_inode_info *lli = cl_i2info(inode);
+ struct cl_object *clob = lli->lli_clob;
+ int result;
+
+ if (S_ISREG(cl_inode_mode(inode))) {
+ env = cl_env_get(refcheck);
+ if (!IS_ERR(env)) {
+ io = ccc_env_thread_io(env);
+ io->ci_obj = clob;
+ *envout = env;
+ *ioout = io;
+ result = 1;
+ } else
+ result = PTR_ERR(env);
+ } else
+ result = 0;
+ return result;
+}
+
+int cl_glimpse_size0(struct inode *inode, int agl)
+{
+ /*
+ * We don't need ast_flags argument to cl_glimpse_size(), because
+ * osc_lock_enqueue() takes care of the possible deadlock that said
+ * argument was introduced to avoid.
+ */
+ /*
+ * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
+ * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
+ * blocking anyway.
+ */
+ struct lu_env *env = NULL;
+ struct cl_io *io = NULL;
+ int result;
+ int refcheck;
+
+ result = cl_io_get(inode, &env, &io, &refcheck);
+ if (result > 0) {
+again:
+ io->ci_verify_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+ if (result > 0)
+ /*
+ * Nothing to do for this io. This currently happens
+ * when the stripe sub-objects are not yet created.
+ */
+ result = io->ci_result;
+ else if (result == 0)
+ result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+ agl);
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
+ cl_io_fini(env, io);
+ if (unlikely(io->ci_need_restart))
+ goto again;
+ cl_env_put(env, &refcheck);
+ }
+ return result;
+}
+
+int cl_local_size(struct inode *inode)
+{
+ struct lu_env *env = NULL;
+ struct cl_io *io = NULL;
+ struct ccc_thread_info *cti;
+ struct cl_object *clob;
+ struct cl_lock_descr *descr;
+ struct cl_lock *lock;
+ int result;
+ int refcheck;
+
+ if (!cl_i2info(inode)->lli_has_smd)
+ return 0;
+
+ result = cl_io_get(inode, &env, &io, &refcheck);
+ if (result <= 0)
+ return result;
+
+ clob = io->ci_obj;
+ result = cl_io_init(env, io, CIT_MISC, clob);
+ if (result > 0)
+ result = io->ci_result;
+ else if (result == 0) {
+ cti = ccc_env_info(env);
+ descr = &cti->cti_descr;
+
+ *descr = whole_file;
+ descr->cld_obj = clob;
+ lock = cl_lock_peek(env, io, descr, "localsize", current);
+ if (lock != NULL) {
+ cl_merge_lvb(env, inode);
+ cl_unuse(env, lock);
+ cl_lock_release(env, lock, "localsize", current);
+ result = 0;
+ } else
+ result = -ENODATA;
+ }
+ cl_io_fini(env, io);
+ cl_env_put(env, &refcheck);
+ return result;
+}
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
new file mode 100644
index 000000000..ab6cb4193
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
@@ -0,0 +1,1287 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../../include/linux/libcfs/libcfs.h"
+# include <linux/fs.h>
+# include <linux/sched.h>
+# include <linux/mm.h>
+# include <linux/quotaops.h>
+# include <linux/highmem.h>
+# include <linux/pagemap.h>
+# include <linux/rbtree.h>
+
+#include "../include/obd.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_ver.h"
+#include "../include/lustre_mdc.h"
+#include "../include/cl_object.h"
+
+#include "../include/lclient.h"
+
+#include "../llite/llite_internal.h"
+
+static const struct cl_req_operations ccc_req_ops;
+
+/*
+ * ccc_ prefix stands for "Common Client Code".
+ */
+
+static struct kmem_cache *ccc_lock_kmem;
+static struct kmem_cache *ccc_object_kmem;
+static struct kmem_cache *ccc_thread_kmem;
+static struct kmem_cache *ccc_session_kmem;
+static struct kmem_cache *ccc_req_kmem;
+
+static struct lu_kmem_descr ccc_caches[] = {
+ {
+ .ckd_cache = &ccc_lock_kmem,
+ .ckd_name = "ccc_lock_kmem",
+ .ckd_size = sizeof(struct ccc_lock)
+ },
+ {
+ .ckd_cache = &ccc_object_kmem,
+ .ckd_name = "ccc_object_kmem",
+ .ckd_size = sizeof(struct ccc_object)
+ },
+ {
+ .ckd_cache = &ccc_thread_kmem,
+ .ckd_name = "ccc_thread_kmem",
+ .ckd_size = sizeof(struct ccc_thread_info),
+ },
+ {
+ .ckd_cache = &ccc_session_kmem,
+ .ckd_name = "ccc_session_kmem",
+ .ckd_size = sizeof(struct ccc_session)
+ },
+ {
+ .ckd_cache = &ccc_req_kmem,
+ .ckd_name = "ccc_req_kmem",
+ .ckd_size = sizeof(struct ccc_req)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+void *ccc_key_init(const struct lu_context *ctx, struct lu_context_key *key)
+{
+ struct ccc_thread_info *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, GFP_NOFS);
+ if (info == NULL)
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+void ccc_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct ccc_thread_info *info = data;
+
+ OBD_SLAB_FREE_PTR(info, ccc_thread_kmem);
+}
+
+void *ccc_session_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct ccc_session *session;
+
+ OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, GFP_NOFS);
+ if (session == NULL)
+ session = ERR_PTR(-ENOMEM);
+ return session;
+}
+
+void ccc_session_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct ccc_session *session = data;
+
+ OBD_SLAB_FREE_PTR(session, ccc_session_kmem);
+}
+
+struct lu_context_key ccc_key = {
+ .lct_tags = LCT_CL_THREAD,
+ .lct_init = ccc_key_init,
+ .lct_fini = ccc_key_fini
+};
+
+struct lu_context_key ccc_session_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = ccc_session_key_init,
+ .lct_fini = ccc_session_key_fini
+};
+
+
+/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */
+/* LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key); */
+
+int ccc_device_init(const struct lu_env *env, struct lu_device *d,
+ const char *name, struct lu_device *next)
+{
+ struct ccc_device *vdv;
+ int rc;
+
+ vdv = lu2ccc_dev(d);
+ vdv->cdv_next = lu2cl_dev(next);
+
+ LASSERT(d->ld_site != NULL && next->ld_type != NULL);
+ next->ld_site = d->ld_site;
+ rc = next->ld_type->ldt_ops->ldto_device_init(
+ env, next, next->ld_type->ldt_name, NULL);
+ if (rc == 0) {
+ lu_device_get(next);
+ lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+ }
+ return rc;
+}
+
+struct lu_device *ccc_device_fini(const struct lu_env *env,
+ struct lu_device *d)
+{
+ return cl2lu_dev(lu2ccc_dev(d)->cdv_next);
+}
+
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg,
+ const struct lu_device_operations *luops,
+ const struct cl_device_operations *clops)
+{
+ struct ccc_device *vdv;
+ struct lu_device *lud;
+ struct cl_site *site;
+ int rc;
+
+ OBD_ALLOC_PTR(vdv);
+ if (vdv == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ lud = &vdv->cdv_cl.cd_lu_dev;
+ cl_device_init(&vdv->cdv_cl, t);
+ ccc2lu_dev(vdv)->ld_ops = luops;
+ vdv->cdv_cl.cd_ops = clops;
+
+ OBD_ALLOC_PTR(site);
+ if (site != NULL) {
+ rc = cl_site_init(site, &vdv->cdv_cl);
+ if (rc == 0)
+ rc = lu_site_init_finish(&site->cs_lu);
+ else {
+ LASSERT(lud->ld_site == NULL);
+ CERROR("Cannot init lu_site, rc %d.\n", rc);
+ OBD_FREE_PTR(site);
+ }
+ } else
+ rc = -ENOMEM;
+ if (rc != 0) {
+ ccc_device_free(env, lud);
+ lud = ERR_PTR(rc);
+ }
+ return lud;
+}
+
+struct lu_device *ccc_device_free(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct ccc_device *vdv = lu2ccc_dev(d);
+ struct cl_site *site = lu2cl_site(d->ld_site);
+ struct lu_device *next = cl2lu_dev(vdv->cdv_next);
+
+ if (d->ld_site != NULL) {
+ cl_site_fini(site);
+ OBD_FREE_PTR(site);
+ }
+ cl_device_fini(lu2cl_dev(d));
+ OBD_FREE_PTR(vdv);
+ return next;
+}
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req)
+{
+ struct ccc_req *vrq;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, GFP_NOFS);
+ if (vrq != NULL) {
+ cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops);
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+/**
+ * An `emergency' environment used by ccc_inode_fini() when cl_env_get()
+ * fails. Access to this environment is serialized by ccc_inode_fini_guard
+ * mutex.
+ */
+static struct lu_env *ccc_inode_fini_env;
+
+/**
+ * A mutex serializing calls to slp_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+static DEFINE_MUTEX(ccc_inode_fini_guard);
+static int dummy_refcheck;
+
+int ccc_global_init(struct lu_device_type *device_type)
+{
+ int result;
+
+ result = lu_kmem_init(ccc_caches);
+ if (result)
+ return result;
+
+ result = lu_device_type_init(device_type);
+ if (result)
+ goto out_kmem;
+
+ ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+ LCT_REMEMBER|LCT_NOREF);
+ if (IS_ERR(ccc_inode_fini_env)) {
+ result = PTR_ERR(ccc_inode_fini_env);
+ goto out_device;
+ }
+
+ ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+ return 0;
+out_device:
+ lu_device_type_fini(device_type);
+out_kmem:
+ lu_kmem_fini(ccc_caches);
+ return result;
+}
+
+void ccc_global_fini(struct lu_device_type *device_type)
+{
+ if (ccc_inode_fini_env != NULL) {
+ cl_env_put(ccc_inode_fini_env, &dummy_refcheck);
+ ccc_inode_fini_env = NULL;
+ }
+ lu_device_type_fini(device_type);
+ lu_kmem_fini(ccc_caches);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *unused,
+ struct lu_device *dev,
+ const struct cl_object_operations *clops,
+ const struct lu_object_operations *luops)
+{
+ struct ccc_object *vob;
+ struct lu_object *obj;
+
+ OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, GFP_NOFS);
+ if (vob != NULL) {
+ struct cl_object_header *hdr;
+
+ obj = ccc2lu(vob);
+ hdr = &vob->cob_header;
+ cl_object_header_init(hdr);
+ lu_object_init(obj, &hdr->coh_lu, dev);
+ lu_object_add_top(&hdr->coh_lu, obj);
+
+ vob->cob_cl.co_ops = clops;
+ obj->lo_ops = luops;
+ } else
+ obj = NULL;
+ return obj;
+}
+
+int ccc_object_init0(const struct lu_env *env,
+ struct ccc_object *vob,
+ const struct cl_object_conf *conf)
+{
+ vob->cob_inode = conf->coc_inode;
+ vob->cob_transient_pages = 0;
+ cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page));
+ return 0;
+}
+
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf)
+{
+ struct ccc_device *dev = lu2ccc_dev(obj->lo_dev);
+ struct ccc_object *vob = lu2ccc(obj);
+ struct lu_object *below;
+ struct lu_device *under;
+ int result;
+
+ under = &dev->cdv_next->cd_lu_dev;
+ below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+ if (below != NULL) {
+ const struct cl_object_conf *cconf;
+
+ cconf = lu2cl_conf(conf);
+ INIT_LIST_HEAD(&vob->cob_pending_list);
+ lu_object_add(obj, below);
+ result = ccc_object_init0(env, vob, cconf);
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+ struct ccc_object *vob = lu2ccc(obj);
+
+ lu_object_fini(obj);
+ lu_object_header_fini(obj->lo_header);
+ OBD_SLAB_FREE_PTR(vob, ccc_object_kmem);
+}
+
+int ccc_lock_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *unused,
+ const struct cl_lock_operations *lkops)
+{
+ struct ccc_lock *clk;
+ int result;
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, GFP_NOFS);
+ if (clk != NULL) {
+ cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops);
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ return 0;
+}
+
+int ccc_object_glimpse(const struct lu_env *env,
+ const struct cl_object *obj, struct ost_lvb *lvb)
+{
+ struct inode *inode = ccc_object_inode(obj);
+
+ lvb->lvb_mtime = cl_inode_mtime(inode);
+ lvb->lvb_atime = cl_inode_atime(inode);
+ lvb->lvb_ctime = cl_inode_ctime(inode);
+ /*
+ * LU-417: Add dirty pages block count lest i_blocks reports 0, some
+ * "cp" or "tar" on remote node may think it's a completely sparse file
+ * and skip it.
+ */
+ if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+ lvb->lvb_blocks = dirty_cnt(inode);
+ return 0;
+}
+
+
+
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf)
+{
+ /* TODO: destroy all pages attached to this object. */
+ return 0;
+}
+
+static void ccc_object_size_lock(struct cl_object *obj)
+{
+ struct inode *inode = ccc_object_inode(obj);
+
+ cl_isize_lock(inode);
+ cl_object_attr_lock(obj);
+}
+
+static void ccc_object_size_unlock(struct cl_object *obj)
+{
+ struct inode *inode = ccc_object_inode(obj);
+
+ cl_object_attr_unlock(obj);
+ cl_isize_unlock(inode);
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+struct page *ccc_page_vmpage(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ return cl2vm_page(slice);
+}
+
+int ccc_page_is_under_lock(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io)
+{
+ struct ccc_io *cio = ccc_env_io(env);
+ struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr;
+ struct cl_page *page = slice->cpl_page;
+
+ int result;
+
+ if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+ io->ci_type == CIT_FAULT) {
+ if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+ result = -EBUSY;
+ else {
+ desc->cld_start = page->cp_index;
+ desc->cld_end = page->cp_index;
+ desc->cld_obj = page->cp_obj;
+ desc->cld_mode = CLM_READ;
+ result = cl_queue_match(&io->ci_lockset.cls_done,
+ desc) ? -EBUSY : 0;
+ }
+ } else
+ result = 0;
+ return result;
+}
+
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice)
+{
+ /*
+ * Cached read?
+ */
+ LBUG();
+ return 0;
+}
+
+void ccc_transient_page_verify(const struct cl_page *page)
+{
+}
+
+int ccc_transient_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused,
+ int nonblock)
+{
+ ccc_transient_page_verify(slice->cpl_page);
+ return 0;
+}
+
+void ccc_transient_page_assume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_unassume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_discard(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct cl_page *page = slice->cpl_page;
+
+ ccc_transient_page_verify(slice->cpl_page);
+
+ /*
+ * For a transient page, remove it from the radix tree.
+ */
+ cl_page_delete(env, page);
+}
+
+int ccc_transient_page_prep(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ /* transient page should always be sent. */
+ return 0;
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+void ccc_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+}
+
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
+{
+ struct ccc_lock *clk = cl2ccc_lock(slice);
+
+ OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem);
+}
+
+int ccc_lock_enqueue(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *unused, __u32 enqflags)
+{
+ CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+ return 0;
+}
+
+int ccc_lock_use(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+ CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+ return 0;
+}
+
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+ CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+ return 0;
+}
+
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+ CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+ return 0;
+}
+
+/**
+ * Implementation of the cl_lock_operations::clo_fits_into() method for the
+ * ccc layer. This function is executed every time io finds an existing lock
+ * in the lock cache while creating a new lock. It has to decide whether the
+ * cached lock "fits" the io.
+ *
+ * \param slice lock to be checked
+ * \param io IO that wants a lock.
+ *
+ * \see lov_lock_fits_into().
+ */
+int ccc_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io)
+{
+ const struct cl_lock *lock = slice->cls_lock;
+ const struct cl_lock_descr *descr = &lock->cll_descr;
+ const struct ccc_io *cio = ccc_env_io(env);
+ int result;
+
+ /*
+ * Work around a DLM peculiarity: it assumes that a glimpse
+ * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns a read lock
+ * when asked for an LCK_PW lock with the LDLM_FL_HAS_INTENT flag set.
+ * Make sure that glimpse doesn't get a CLM_WRITE top-lock, so that it
+ * doesn't enqueue CLM_WRITE sub-locks.
+ */
+ if (cio->cui_glimpse)
+ result = descr->cld_mode != CLM_WRITE;
+
+ /*
+ * Also, don't match incomplete write locks for read, otherwise read
+ * would enqueue missing sub-locks in the write mode.
+ */
+ else if (need->cld_mode != descr->cld_mode)
+ result = lock->cll_state >= CLS_ENQUEUED;
+ else
+ result = 1;
+ return result;
+}
+
+/**
+ * Implements the cl_lock_operations::clo_state() method for the ccc layer,
+ * invoked whenever the lock state changes. Transfers into the inode the
+ * object attributes that might have been updated as a result of acquiring
+ * the lock.
+ */
+void ccc_lock_state(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state state)
+{
+ struct cl_lock *lock = slice->cls_lock;
+
+ /*
+ * Refresh inode attributes when the lock is moving into CLS_HELD
+ * state, and only when this is a result of real enqueue, rather than
+ * of finding lock in the cache.
+ */
+ if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
+ struct cl_object *obj;
+ struct inode *inode;
+
+ obj = slice->cls_obj;
+ inode = ccc_object_inode(obj);
+
+ /* vmtruncate() sets the i_size
+ * under both a DLM lock and the
+ * ll_inode_size_lock(). If we don't get the
+ * ll_inode_size_lock() here we can match the DLM lock and
+ * reset i_size. generic_file_write can then trust the
+ * stale i_size when doing appending writes and effectively
+ * cancel the result of the truncate. Getting the
+ * ll_inode_size_lock() after the enqueue maintains the DLM
+ * -> ll_inode_size_lock() acquiring order. */
+ if (lock->cll_descr.cld_start == 0 &&
+ lock->cll_descr.cld_end == CL_PAGE_EOF)
+ cl_merge_lvb(env, inode);
+ }
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+
+ CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+}
+
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+ __u32 enqflags, enum cl_lock_mode mode,
+ pgoff_t start, pgoff_t end)
+{
+ struct ccc_io *cio = ccc_env_io(env);
+ struct cl_lock_descr *descr = &cio->cui_link.cill_descr;
+ struct cl_object *obj = io->ci_obj;
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
+
+ memset(&cio->cui_link, 0, sizeof(cio->cui_link));
+
+ if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+ descr->cld_mode = CLM_GROUP;
+ descr->cld_gid = cio->cui_fd->fd_grouplock.cg_gid;
+ } else {
+ descr->cld_mode = mode;
+ }
+ descr->cld_obj = obj;
+ descr->cld_start = start;
+ descr->cld_end = end;
+ descr->cld_enq_flags = enqflags;
+
+ cl_io_lock_add(env, io, &cio->cui_link);
+ return 0;
+}
+
+void ccc_io_update_iov(const struct lu_env *env,
+ struct ccc_io *cio, struct cl_io *io)
+{
+ size_t size = io->u.ci_rw.crw_count;
+
+ if (!cl_is_normalio(env, io) || cio->cui_iter == NULL)
+ return;
+
+ iov_iter_truncate(cio->cui_iter, size);
+}
+
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+ __u32 enqflags, enum cl_lock_mode mode,
+ loff_t start, loff_t end)
+{
+ struct cl_object *obj = io->ci_obj;
+
+ return ccc_io_one_lock_index(env, io, enqflags, mode,
+ cl_index(obj, start), cl_index(obj, end));
+}
+
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ CLOBINVRNT(env, ios->cis_io->ci_obj,
+ ccc_object_invariant(ios->cis_io->ci_obj));
+}
+
+void ccc_io_advance(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ size_t nob)
+{
+ struct ccc_io *cio = cl2ccc_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = ios->cis_io->ci_obj;
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ if (!cl_is_normalio(env, io))
+ return;
+
+ iov_iter_reexpand(cio->cui_iter, cio->cui_tot_count -= nob);
+}
+
+/**
+ * Helper function that, if necessary, adjusts the file size (inode->i_size)
+ * when the position at offset \a pos is accessed. The file size can be
+ * arbitrarily stale on a Lustre client, but the client at least knows the
+ * KMS. If the accessed area is inside [0, KMS], set the file size to KMS;
+ * otherwise glimpse the file size.
+ *
+ * Locking: cl_isize_lock is used to serialize changes to inode size and to
+ * protect consistency between inode size and cl_object
+ * attributes. cl_object_size_lock() protects consistency between cl_attr's of
+ * top-object and sub-objects.
+ */
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io, loff_t start, size_t count, int *exceed)
+{
+ struct cl_attr *attr = ccc_env_thread_attr(env);
+ struct inode *inode = ccc_object_inode(obj);
+ loff_t pos = start + count - 1;
+ loff_t kms;
+ int result;
+
+ /*
+ * Consistency guarantees: following possibilities exist for the
+ * relation between region being accessed and real file size at this
+ * moment:
+ *
+ * (A): the region is completely inside of the file;
+ *
+ * (B-x): x bytes of region are inside of the file, the rest is
+ * outside;
+ *
+ * (C): the region is completely outside of the file.
+ *
+ * This classification is stable under the DLM lock already acquired by
+ * the caller, because to change the class another client has to take a
+ * DLM lock conflicting with ours. Also, any updates to ->i_size
+ * by other threads on this client are serialized by
+ * ll_inode_size_lock(). This guarantees that short reads are handled
+ * correctly in the face of concurrent writes and truncates.
+ */
+ ccc_object_size_lock(obj);
+ result = cl_object_attr_get(env, obj, attr);
+ if (result == 0) {
+ kms = attr->cat_kms;
+ if (pos > kms) {
+ /*
+ * A glimpse is necessary to determine whether we
+ * return a short read (B) or some zeroes at the end
+ * of the buffer (C)
+ */
+ ccc_object_size_unlock(obj);
+ result = cl_glimpse_lock(env, io, inode, obj, 0);
+ if (result == 0 && exceed != NULL) {
+ /* If the accessed page index exceeds the end-of-file
+ * page index, return directly; do not expect the
+ * kernel to check such a case correctly
+ * (linux-2.6.18-128.1.1 misses it).
+ * --bug 17336 */
+ loff_t size = cl_isize_read(inode);
+ loff_t cur_index = start >> PAGE_CACHE_SHIFT;
+ loff_t size_index = (size - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if ((size == 0 && cur_index != 0) ||
+ size_index < cur_index)
+ *exceed = 1;
+ }
+ return result;
+ } else {
+ /*
+ * region is within kms and, hence, within real file
+ * size (A). We need to increase i_size to cover the
+ * read region so that generic_file_read() will do its
+ * job, but that doesn't mean the kms size is
+ * _correct_, it is only the _minimum_ size. If
+ * someone does a stat they will get the correct size
+ * which will always be >= the kms value here.
+ * b=11081
+ */
+ if (cl_isize_read(inode) < kms) {
+ cl_isize_write_nolock(inode, kms);
+ CDEBUG(D_VFSTRACE,
+ DFID" updating i_size %llu\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ (__u64)cl_isize_read(inode));
+
+ }
+ }
+ }
+ ccc_object_size_unlock(obj);
+ return result;
+}
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+void ccc_req_completion(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret)
+{
+ struct ccc_req *vrq;
+
+ if (ioret > 0)
+ cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
+ vrq = cl2ccc_req(slice);
+ OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for ccc
+ * layer. ccc is responsible for
+ *
+ * - o_[mac]time
+ *
+ * - o_mode
+ *
+ * - o_parent_seq
+ *
+ * - o_[ug]id
+ *
+ * - o_parent_oid
+ *
+ * - o_parent_ver
+ *
+ * - o_ioepoch,
+ *
+ * and capability.
+ */
+void ccc_req_attr_set(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *attr, u64 flags)
+{
+ struct inode *inode;
+ struct obdo *oa;
+ u32 valid_flags;
+
+ oa = attr->cra_oa;
+ inode = ccc_object_inode(obj);
+ valid_flags = OBD_MD_FLTYPE;
+
+ if ((flags & OBD_MD_FLOSSCAPA) != 0) {
+ LASSERT(attr->cra_capa == NULL);
+ attr->cra_capa = cl_capa_lookup(inode,
+ slice->crs_req->crq_type);
+ }
+
+ if (slice->crs_req->crq_type == CRT_WRITE) {
+ if (flags & OBD_MD_FLEPOCH) {
+ oa->o_valid |= OBD_MD_FLEPOCH;
+ oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
+ valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ OBD_MD_FLUID | OBD_MD_FLGID;
+ }
+ }
+ obdo_from_inode(oa, inode, valid_flags & flags);
+ obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid);
+ memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid,
+ JOBSTATS_JOBID_SIZE);
+}
+
+static const struct cl_req_operations ccc_req_ops = {
+ .cro_attr_set = ccc_req_attr_set,
+ .cro_completion = ccc_req_completion
+};
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+ struct obd_capa *capa)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ int result;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ io = ccc_env_thread_io(env);
+ io->ci_obj = cl_i2info(inode)->lli_clob;
+
+ io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
+ io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
+ io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
+ io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
+ io->u.ci_setattr.sa_valid = attr->ia_valid;
+ io->u.ci_setattr.sa_capa = capa;
+
+again:
+ if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
+ struct ccc_io *cio = ccc_env_io(env);
+
+ if (attr->ia_valid & ATTR_FILE)
+ /* populate the file descriptor for ftruncate to honor
+ * group lock - see LU-787 */
+ cio->cui_fd = cl_iattr2fd(inode, attr);
+
+ result = cl_io_loop(env, io);
+ } else {
+ result = io->ci_result;
+ }
+ cl_io_fini(env, io);
+ if (unlikely(io->ci_need_restart))
+ goto again;
+ /* HSM import case: file is released, cannot be restored
+ * no need to fail except if restore registration failed
+ * with -ENODATA */
+ if (result == -ENODATA && io->ci_restore_needed &&
+ io->ci_result != -ENODATA)
+ result = 0;
+ cl_env_put(env, &refcheck);
+ return result;
+}
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+struct lu_device *ccc2lu_dev(struct ccc_device *vdv)
+{
+ return &vdv->cdv_cl.cd_lu_dev;
+}
+
+struct ccc_device *lu2ccc_dev(const struct lu_device *d)
+{
+ return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev);
+}
+
+struct ccc_device *cl2ccc_dev(const struct cl_device *d)
+{
+ return container_of0(d, struct ccc_device, cdv_cl);
+}
+
+struct lu_object *ccc2lu(struct ccc_object *vob)
+{
+ return &vob->cob_cl.co_lu;
+}
+
+struct ccc_object *lu2ccc(const struct lu_object *obj)
+{
+ return container_of0(obj, struct ccc_object, cob_cl.co_lu);
+}
+
+struct ccc_object *cl2ccc(const struct cl_object *obj)
+{
+ return container_of0(obj, struct ccc_object, cob_cl);
+}
+
+struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice)
+{
+ return container_of(slice, struct ccc_lock, clk_cl);
+}
+
+struct ccc_io *cl2ccc_io(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct ccc_io *cio;
+
+ cio = container_of(slice, struct ccc_io, cui_cl);
+ LASSERT(cio == ccc_env_io(env));
+ return cio;
+}
+
+struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice)
+{
+ return container_of0(slice, struct ccc_req, crq_cl);
+}
+
+struct page *cl2vm_page(const struct cl_page_slice *slice)
+{
+ return cl2ccc_page(slice)->cpg_page;
+}
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+int ccc_object_invariant(const struct cl_object *obj)
+{
+ struct inode *inode = ccc_object_inode(obj);
+ struct cl_inode_info *lli = cl_i2info(inode);
+
+ return (S_ISREG(cl_inode_mode(inode)) ||
+ /* i_mode of unlinked inode is zeroed. */
+ cl_inode_mode(inode) == 0) && lli->lli_clob == obj;
+}
+
+struct inode *ccc_object_inode(const struct cl_object *obj)
+{
+ return cl2ccc(obj)->cob_inode;
+}
+
+/**
+ * Returns a pointer to cl_page associated with \a vmpage, without acquiring
+ * additional reference to the resulting page. This is an unsafe version of
+ * cl_vmpage_page() that can only be used under vmpage lock.
+ */
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage)
+{
+ KLASSERT(PageLocked(vmpage));
+ return (struct cl_page *)vmpage->private;
+}
+
+/**
+ * Initialize or update CLIO structures for regular files when new
+ * meta-data arrives from the server.
+ *
+ * \param inode regular file inode
+ * \param md new file metadata from MDS
+ * - allocates a cl_object if necessary,
+ * - updates the layout if the object was already here.
+ */
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
+{
+ struct lu_env *env;
+ struct cl_inode_info *lli;
+ struct cl_object *clob;
+ struct lu_site *site;
+ struct lu_fid *fid;
+ struct cl_object_conf conf = {
+ .coc_inode = inode,
+ .u = {
+ .coc_md = md
+ }
+ };
+ int result = 0;
+ int refcheck;
+
+ LASSERT(md->body->valid & OBD_MD_FLID);
+ LASSERT(S_ISREG(cl_inode_mode(inode)));
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ site = cl_i2sbi(inode)->ll_site;
+ lli = cl_i2info(inode);
+ fid = &lli->lli_fid;
+ LASSERT(fid_is_sane(fid));
+
+ if (lli->lli_clob == NULL) {
+ /* clob is a slave of the inode; an empty lli_clob means this is
+ * a new inode with no clob in the cache for the given fid, so it
+ * is unnecessary to perform lookup-alloc-lookup-insert, just
+ * alloc and insert directly. */
+ LASSERT(inode->i_state & I_NEW);
+ conf.coc_lu.loc_flags = LOC_F_NEW;
+ clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
+ fid, &conf);
+ if (!IS_ERR(clob)) {
+ /*
+ * No locking is necessary, as new inode is
+ * locked by I_NEW bit.
+ */
+ lli->lli_clob = clob;
+ lli->lli_has_smd = lsm_has_objects(md->lsm);
+ lu_object_ref_add(&clob->co_lu, "inode", inode);
+ } else
+ result = PTR_ERR(clob);
+ } else {
+ result = cl_conf_set(env, lli->lli_clob, &conf);
+ }
+
+ cl_env_put(env, &refcheck);
+
+ if (result != 0)
+ CERROR("Failure to initialize cl object "DFID": %d\n",
+ PFID(fid), result);
+ return result;
+}
+
+/**
+ * Wait for other users to drop their references to the object first, then
+ * drop the last one, which causes the object to be destroyed immediately.
+ * Must be called after cl_object_kill() against this object.
+ *
+ * The reason we want to do this: destroying the top object waits for its sub
+ * objects to be destroyed first, so we cannot let the bottom layer (e.g. from
+ * ASTs) initiate destruction of the top object, which may deadlock. See bz22520.
+ */
+static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
+{
+ struct lu_object_header *header = obj->co_lu.lo_header;
+ wait_queue_t waiter;
+
+ if (unlikely(atomic_read(&header->loh_ref) != 1)) {
+ struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+ struct lu_site_bkt_data *bkt;
+
+ bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
+
+ init_waitqueue_entry(&waiter, current);
+ add_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&header->loh_ref) == 1)
+ break;
+ schedule();
+ }
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+ }
+
+ cl_object_put(env, obj);
+}
+
+void cl_inode_fini(struct inode *inode)
+{
+ struct lu_env *env;
+ struct cl_inode_info *lli = cl_i2info(inode);
+ struct cl_object *clob = lli->lli_clob;
+ int refcheck;
+ int emergency;
+
+ if (clob != NULL) {
+ void *cookie;
+
+ cookie = cl_env_reenter();
+ env = cl_env_get(&refcheck);
+ emergency = IS_ERR(env);
+ if (emergency) {
+ mutex_lock(&ccc_inode_fini_guard);
+ LASSERT(ccc_inode_fini_env != NULL);
+ cl_env_implant(ccc_inode_fini_env, &refcheck);
+ env = ccc_inode_fini_env;
+ }
+ /*
+ * The cl_object cache is a slave to the inode cache (which, in turn,
+ * is a slave to the dentry cache); don't keep a cl_object in memory
+ * when its master is evicted.
+ */
+ cl_object_kill(env, clob);
+ lu_object_ref_del(&clob->co_lu, "inode", inode);
+ cl_object_put_last(env, clob);
+ lli->lli_clob = NULL;
+ if (emergency) {
+ cl_env_unplant(ccc_inode_fini_env, &refcheck);
+ mutex_unlock(&ccc_inode_fini_guard);
+ } else
+ cl_env_put(env, &refcheck);
+ cl_env_reexit(cookie);
+ }
+}
+
+/**
+ * Return the IF_* type for the given lu_dirent entry.
+ * The IF_* flag should be converted to the particular OS file type by the
+ * platform llite module.
+ */
+__u16 ll_dirent_type_get(struct lu_dirent *ent)
+{
+ __u16 type = 0;
+ struct luda_type *lt;
+ int len = 0;
+
+ if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
+ const unsigned align = sizeof(struct luda_type) - 1;
+
+ len = le16_to_cpu(ent->lde_namelen);
+ len = (len + align) & ~align;
+ lt = (void *)ent->lde_name + len;
+ type = IFTODT(le16_to_cpu(lt->lt_type));
+ }
+ return type;
+}
+
+/**
+ * Build the inode number from the passed @fid. */
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
+{
+ if (BITS_PER_LONG == 32 || api32)
+ return fid_flatten32(fid);
+ else
+ return fid_flatten(fid);
+}
+
+/**
+ * Build the inode generation from the passed @fid. If our FID overflows the
+ * 32-bit inode number then return a non-zero generation to distinguish them. */
+__u32 cl_fid_build_gen(const struct lu_fid *fid)
+{
+ __u32 gen;
+
+ if (fid_is_igif(fid)) {
+ gen = lu_igif_gen(fid);
+ return gen;
+ }
+
+ gen = fid_flatten(fid) >> 32;
+ return gen;
+}
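As a rough usage sketch, a caller needing both values for the same FID would combine the two helpers above as follows; the wrapper function and its parameter names are illustrative only, not part of the Lustre tree:

    static void fid_to_ino_gen(const struct lu_fid *fid, int api32,
                               __u64 *ino, __u32 *gen)
    {
            /* 32-bit kernels or 32-bit API callers get the 32-bit flattening */
            *ino = cl_fid_build_ino(fid, api32);
            /* generation disambiguates FIDs whose 32-bit inode numbers collide */
            *gen = cl_fid_build_gen(fid);
    }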
+
+/* The lsm is unreliable after the HSM implementation because the layout can
+ * be changed at any time. This is only to support old, non-clio-ized
+ * interfaces. It will cause a deadlock if clio operations are called while
+ * holding this extra layout refcount, because if the layout changes during
+ * the IO, ll_layout_refresh() has to wait for the refcount to drop to zero
+ * before it can destroy the older layout.
+ *
+ * Notice that the lsm returned by this function may not be valid unless
+ * called inside the layout lock - MDS_INODELOCK_LAYOUT. */
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode)
+{
+ return lov_lsm_get(cl_i2info(inode)->lli_clob);
+}
+
+inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm)
+{
+ lov_lsm_put(cl_i2info(inode)->lli_clob, lsm);
+}
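A minimal sketch of the intended get/put pairing for the two helpers above; the caller context is hypothetical, and per the comment the lsm is only guaranteed valid under MDS_INODELOCK_LAYOUT:

    struct lov_stripe_md *lsm;

    lsm = ccc_inode_lsm_get(inode);
    if (lsm != NULL) {
            /* inspect striping data here; do not issue clio IO while
             * holding this extra layout reference */
            ccc_inode_lsm_put(inode, lsm);
    }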
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
new file mode 100644
index 000000000..01bf894d4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
@@ -0,0 +1,199 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ */
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/obd.h"
+#include "../include/cl_object.h"
+#include "../include/lclient.h"
+
+#include "../include/lustre_lite.h"
+
+
+/* Initialize the default and maximum LOV EA and cookie sizes. This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
+{
+ struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
+ __u32 valsize = sizeof(struct lov_desc);
+ int rc, easize, def_easize, cookiesize;
+ struct lov_desc desc;
+ __u16 stripes, def_stripes;
+
+ rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
+ &valsize, &desc, NULL);
+ if (rc)
+ return rc;
+
+ stripes = min_t(__u32, desc.ld_tgt_count, LOV_MAX_STRIPE_COUNT);
+ lsm.lsm_stripe_count = stripes;
+ easize = obd_size_diskmd(dt_exp, &lsm);
+
+ def_stripes = min_t(__u32, desc.ld_default_stripe_count,
+ LOV_MAX_STRIPE_COUNT);
+ lsm.lsm_stripe_count = def_stripes;
+ def_easize = obd_size_diskmd(dt_exp, &lsm);
+
+ cookiesize = stripes * sizeof(struct llog_cookie);
+
+ /* the default cookiesize is 0 because, since 2.4, the server does not
+ * send llog cookies to the client. */
+ CDEBUG(D_HA,
+ "updating def/max_easize: %d/%d def/max_cookiesize: 0/%d\n",
+ def_easize, easize, cookiesize);
+
+ rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize, 0);
+ return rc;
+}
+
+/**
+ * This function is used as an upcall-callback hooked by liblustre and llite
+ * clients into the obd_notify() listeners chain to handle notifications
+ * about changes of the import connect_flags. See llu_fsswop_mount() and
+ * lustre_common_fill_super().
+ */
+int cl_ocd_update(struct obd_device *host,
+ struct obd_device *watched,
+ enum obd_notify_event ev, void *owner, void *data)
+{
+ struct lustre_client_ocd *lco;
+ struct client_obd *cli;
+ __u64 flags;
+ int result;
+
+ if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+ cli = &watched->u.cli;
+ lco = owner;
+ flags = cli->cl_import->imp_connect_data.ocd_connect_flags;
+ CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n",
+ lco->lco_flags, flags);
+ mutex_lock(&lco->lco_lock);
+ lco->lco_flags &= flags;
+ /* for each osc event update ea size */
+ if (lco->lco_dt_exp)
+ cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp);
+
+ mutex_unlock(&lco->lco_lock);
+ result = 0;
+ } else {
+ CERROR("unexpected notification from %s %s!\n",
+ watched->obd_type->typ_name,
+ watched->obd_name);
+ result = -EINVAL;
+ }
+ return result;
+}
+
+#define GROUPLOCK_SCOPE "grouplock"
+
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+ struct ccc_grouplock *cg)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_lock *lock;
+ struct cl_lock_descr *descr;
+ __u32 enqflags;
+ int refcheck;
+ int rc;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ io = ccc_env_thread_io(env);
+ io->ci_obj = obj;
+ io->ci_ignore_layout = 1;
+
+ rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+ if (rc) {
+ /* Does not make sense to take GL for released layout */
+ if (rc > 0)
+ rc = -ENOTSUPP;
+ cl_env_put(env, &refcheck);
+ return rc;
+ }
+
+ descr = &ccc_env_info(env)->cti_descr;
+ descr->cld_obj = obj;
+ descr->cld_start = 0;
+ descr->cld_end = CL_PAGE_EOF;
+ descr->cld_gid = gid;
+ descr->cld_mode = CLM_GROUP;
+
+ enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0);
+ descr->cld_enq_flags = enqflags;
+
+ lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current);
+ if (IS_ERR(lock)) {
+ cl_io_fini(env, io);
+ cl_env_put(env, &refcheck);
+ return PTR_ERR(lock);
+ }
+
+ cg->cg_env = cl_env_get(&refcheck);
+ cg->cg_io = io;
+ cg->cg_lock = lock;
+ cg->cg_gid = gid;
+ LASSERT(cg->cg_env == env);
+
+ cl_env_unplant(env, &refcheck);
+ return 0;
+}
+
+void cl_put_grouplock(struct ccc_grouplock *cg)
+{
+ struct lu_env *env = cg->cg_env;
+ struct cl_io *io = cg->cg_io;
+ struct cl_lock *lock = cg->cg_lock;
+ int refcheck;
+
+ LASSERT(cg->cg_env);
+ LASSERT(cg->cg_gid);
+
+ cl_env_implant(env, &refcheck);
+ cl_env_put(env, &refcheck);
+
+ cl_unuse(env, lock);
+ cl_lock_release(env, lock, GROUPLOCK_SCOPE, current);
+ cl_io_fini(env, io);
+ cl_env_put(env, NULL);
+}
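A hedged sketch of how a caller (for example an ioctl handler) would pair the two functions above; the gid value and the error handling are illustrative only:

    struct ccc_grouplock cg;
    int rc;

    rc = cl_get_grouplock(obj, gid, 0 /* blocking */, &cg);
    if (rc != 0)
            return rc;
    /* ... perform IO protected by the CLM_GROUP lock ... */
    cl_put_grouplock(&cg);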
diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
new file mode 100644
index 000000000..eab2bd602
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
@@ -0,0 +1,751 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/interval_tree.c
+ *
+ * Interval tree library used by ldlm extent lock code
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+#include "../include/lustre_dlm.h"
+#include "../include/obd_support.h"
+#include "../include/interval_tree.h"
+
+enum {
+ INTERVAL_RED = 0,
+ INTERVAL_BLACK = 1
+};
+
+static inline int node_is_left_child(struct interval_node *node)
+{
+ LASSERT(node->in_parent != NULL);
+ return node == node->in_parent->in_left;
+}
+
+static inline int node_is_right_child(struct interval_node *node)
+{
+ LASSERT(node->in_parent != NULL);
+ return node == node->in_parent->in_right;
+}
+
+static inline int node_is_red(struct interval_node *node)
+{
+ return node->in_color == INTERVAL_RED;
+}
+
+static inline int node_is_black(struct interval_node *node)
+{
+ return node->in_color == INTERVAL_BLACK;
+}
+
+static inline int extent_compare(struct interval_node_extent *e1,
+ struct interval_node_extent *e2)
+{
+ int rc;
+
+ if (e1->start == e2->start) {
+ if (e1->end < e2->end)
+ rc = -1;
+ else if (e1->end > e2->end)
+ rc = 1;
+ else
+ rc = 0;
+ } else {
+ if (e1->start < e2->start)
+ rc = -1;
+ else
+ rc = 1;
+ }
+ return rc;
+}
+
+static inline int extent_equal(struct interval_node_extent *e1,
+ struct interval_node_extent *e2)
+{
+ return (e1->start == e2->start) && (e1->end == e2->end);
+}
+
+static inline int extent_overlapped(struct interval_node_extent *e1,
+ struct interval_node_extent *e2)
+{
+ return (e1->start <= e2->end) && (e2->start <= e1->end);
+}
+
+static inline int node_compare(struct interval_node *n1,
+ struct interval_node *n2)
+{
+ return extent_compare(&n1->in_extent, &n2->in_extent);
+}
+
+static inline int node_equal(struct interval_node *n1,
+ struct interval_node *n2)
+{
+ return extent_equal(&n1->in_extent, &n2->in_extent);
+}
+
+static inline __u64 max_u64(__u64 x, __u64 y)
+{
+ return x > y ? x : y;
+}
+
+static inline __u64 min_u64(__u64 x, __u64 y)
+{
+ return x < y ? x : y;
+}
+
+#define interval_for_each(node, root) \
+for (node = interval_first(root); node != NULL; \
+ node = interval_next(node))
+
+#define interval_for_each_reverse(node, root) \
+for (node = interval_last(root); node != NULL; \
+ node = interval_prev(node))
+
+static struct interval_node *interval_first(struct interval_node *node)
+{
+ if (!node)
+ return NULL;
+ while (node->in_left)
+ node = node->in_left;
+ return node;
+}
+
+static struct interval_node *interval_last(struct interval_node *node)
+{
+ if (!node)
+ return NULL;
+ while (node->in_right)
+ node = node->in_right;
+ return node;
+}
+
+static struct interval_node *interval_next(struct interval_node *node)
+{
+ if (!node)
+ return NULL;
+ if (node->in_right)
+ return interval_first(node->in_right);
+ while (node->in_parent && node_is_right_child(node))
+ node = node->in_parent;
+ return node->in_parent;
+}
+
+static struct interval_node *interval_prev(struct interval_node *node)
+{
+ if (!node)
+ return NULL;
+
+ if (node->in_left)
+ return interval_last(node->in_left);
+
+ while (node->in_parent && node_is_left_child(node))
+ node = node->in_parent;
+
+ return node->in_parent;
+}
+
+enum interval_iter interval_iterate(struct interval_node *root,
+ interval_callback_t func,
+ void *data)
+{
+ struct interval_node *node;
+ enum interval_iter rc = INTERVAL_ITER_CONT;
+
+ interval_for_each(node, root) {
+ rc = func(node, data);
+ if (rc == INTERVAL_ITER_STOP)
+ break;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(interval_iterate);
+
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+ interval_callback_t func,
+ void *data)
+{
+ struct interval_node *node;
+ enum interval_iter rc = INTERVAL_ITER_CONT;
+
+ interval_for_each_reverse(node, root) {
+ rc = func(node, data);
+ if (rc == INTERVAL_ITER_STOP)
+ break;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(interval_iterate_reverse);
+
+/* Try to find a node with the same interval in the tree;
+ * if found, return the pointer to the node, otherwise return NULL. */
+struct interval_node *interval_find(struct interval_node *root,
+ struct interval_node_extent *ex)
+{
+ struct interval_node *walk = root;
+ int rc;
+
+ while (walk) {
+ rc = extent_compare(ex, &walk->in_extent);
+ if (rc == 0)
+ break;
+ else if (rc < 0)
+ walk = walk->in_left;
+ else
+ walk = walk->in_right;
+ }
+
+ return walk;
+}
+EXPORT_SYMBOL(interval_find);
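A small sketch of how insertion and lookup combine; it assumes interval_set() from interval_tree.h initializes the node's extent (it is used that way later in this patch), and the on-stack node and extent values are illustrative:

    struct interval_node *root = NULL;
    struct interval_node node;
    struct interval_node_extent ext = { .start = 0, .end = 4095 };

    interval_set(&node, ext.start, ext.end);
    if (interval_insert(&node, &root) == NULL) {
            /* no node with an equal extent existed, so ours was linked in */
            LASSERT(interval_find(root, &ext) == &node);
            interval_erase(&node, &root);
    }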
+
+static void __rotate_change_maxhigh(struct interval_node *node,
+ struct interval_node *rotate)
+{
+ __u64 left_max, right_max;
+
+ rotate->in_max_high = node->in_max_high;
+ left_max = node->in_left ? node->in_left->in_max_high : 0;
+ right_max = node->in_right ? node->in_right->in_max_high : 0;
+ node->in_max_high = max_u64(interval_high(node),
+ max_u64(left_max, right_max));
+}
+
+/* The left rotation "pivots" around the link from node to node->right, and
+ * - node will be linked to node->right's left child, and
+ * - node->right's left child will be linked to node's right child. */
+static void __rotate_left(struct interval_node *node,
+ struct interval_node **root)
+{
+ struct interval_node *right = node->in_right;
+ struct interval_node *parent = node->in_parent;
+
+ node->in_right = right->in_left;
+ if (node->in_right)
+ right->in_left->in_parent = node;
+
+ right->in_left = node;
+ right->in_parent = parent;
+ if (parent) {
+ if (node_is_left_child(node))
+ parent->in_left = right;
+ else
+ parent->in_right = right;
+ } else {
+ *root = right;
+ }
+ node->in_parent = right;
+
+ /* update max_high for node and right */
+ __rotate_change_maxhigh(node, right);
+}
+
+/* The right rotation "pivots" around the link from node to node->left, and
+ * - node will be linked to node->left's right child, and
+ * - node->left's right child will be linked to node's left child. */
+static void __rotate_right(struct interval_node *node,
+ struct interval_node **root)
+{
+ struct interval_node *left = node->in_left;
+ struct interval_node *parent = node->in_parent;
+
+ node->in_left = left->in_right;
+ if (node->in_left)
+ left->in_right->in_parent = node;
+ left->in_right = node;
+
+ left->in_parent = parent;
+ if (parent) {
+ if (node_is_right_child(node))
+ parent->in_right = left;
+ else
+ parent->in_left = left;
+ } else {
+ *root = left;
+ }
+ node->in_parent = left;
+
+ /* update max_high for node and left */
+ __rotate_change_maxhigh(node, left);
+}
+
+#define interval_swap(a, b) do { \
+ struct interval_node *c = a; a = b; b = c; \
+} while (0)
+
+/*
+ * The INSERT and DELETE operations, when run on a tree with n keys,
+ * take O(log n) time. Because they modify the tree, the result
+ * may violate the red-black properties. To restore these properties,
+ * we must change the colors of some of the nodes in the tree
+ * and also change the pointer structure.
+ */
+static void interval_insert_color(struct interval_node *node,
+ struct interval_node **root)
+{
+ struct interval_node *parent, *gparent;
+
+ while ((parent = node->in_parent) && node_is_red(parent)) {
+ gparent = parent->in_parent;
+ /* Parent is RED, so gparent must not be NULL */
+ if (node_is_left_child(parent)) {
+ struct interval_node *uncle;
+
+ uncle = gparent->in_right;
+ if (uncle && node_is_red(uncle)) {
+ uncle->in_color = INTERVAL_BLACK;
+ parent->in_color = INTERVAL_BLACK;
+ gparent->in_color = INTERVAL_RED;
+ node = gparent;
+ continue;
+ }
+
+ if (parent->in_right == node) {
+ __rotate_left(parent, root);
+ interval_swap(node, parent);
+ }
+
+ parent->in_color = INTERVAL_BLACK;
+ gparent->in_color = INTERVAL_RED;
+ __rotate_right(gparent, root);
+ } else {
+ struct interval_node *uncle;
+
+ uncle = gparent->in_left;
+ if (uncle && node_is_red(uncle)) {
+ uncle->in_color = INTERVAL_BLACK;
+ parent->in_color = INTERVAL_BLACK;
+ gparent->in_color = INTERVAL_RED;
+ node = gparent;
+ continue;
+ }
+
+ if (node_is_left_child(node)) {
+ __rotate_right(parent, root);
+ interval_swap(node, parent);
+ }
+
+ parent->in_color = INTERVAL_BLACK;
+ gparent->in_color = INTERVAL_RED;
+ __rotate_left(gparent, root);
+ }
+ }
+
+ (*root)->in_color = INTERVAL_BLACK;
+}
+
+struct interval_node *interval_insert(struct interval_node *node,
+ struct interval_node **root)
+
+{
+ struct interval_node **p, *parent = NULL;
+
+ LASSERT(!interval_is_intree(node));
+ p = root;
+ while (*p) {
+ parent = *p;
+ if (node_equal(parent, node))
+ return parent;
+
+ /* max_high field must be updated after each iteration */
+ if (parent->in_max_high < interval_high(node))
+ parent->in_max_high = interval_high(node);
+
+ if (node_compare(node, parent) < 0)
+ p = &parent->in_left;
+ else
+ p = &parent->in_right;
+ }
+
+ /* link node into the tree */
+ node->in_parent = parent;
+ node->in_color = INTERVAL_RED;
+ node->in_left = node->in_right = NULL;
+ *p = node;
+
+ interval_insert_color(node, root);
+ node->in_intree = 1;
+
+ return NULL;
+}
+EXPORT_SYMBOL(interval_insert);
+
+static inline int node_is_black_or_0(struct interval_node *node)
+{
+ return !node || node_is_black(node);
+}
+
+static void interval_erase_color(struct interval_node *node,
+ struct interval_node *parent,
+ struct interval_node **root)
+{
+ struct interval_node *tmp;
+
+ while (node_is_black_or_0(node) && node != *root) {
+ if (parent->in_left == node) {
+ tmp = parent->in_right;
+ if (node_is_red(tmp)) {
+ tmp->in_color = INTERVAL_BLACK;
+ parent->in_color = INTERVAL_RED;
+ __rotate_left(parent, root);
+ tmp = parent->in_right;
+ }
+ if (node_is_black_or_0(tmp->in_left) &&
+ node_is_black_or_0(tmp->in_right)) {
+ tmp->in_color = INTERVAL_RED;
+ node = parent;
+ parent = node->in_parent;
+ } else {
+ if (node_is_black_or_0(tmp->in_right)) {
+ struct interval_node *o_left;
+
+ o_left = tmp->in_left;
+ if (o_left)
+ o_left->in_color = INTERVAL_BLACK;
+ tmp->in_color = INTERVAL_RED;
+ __rotate_right(tmp, root);
+ tmp = parent->in_right;
+ }
+ tmp->in_color = parent->in_color;
+ parent->in_color = INTERVAL_BLACK;
+ if (tmp->in_right)
+ tmp->in_right->in_color = INTERVAL_BLACK;
+ __rotate_left(parent, root);
+ node = *root;
+ break;
+ }
+ } else {
+ tmp = parent->in_left;
+ if (node_is_red(tmp)) {
+ tmp->in_color = INTERVAL_BLACK;
+ parent->in_color = INTERVAL_RED;
+ __rotate_right(parent, root);
+ tmp = parent->in_left;
+ }
+ if (node_is_black_or_0(tmp->in_left) &&
+ node_is_black_or_0(tmp->in_right)) {
+ tmp->in_color = INTERVAL_RED;
+ node = parent;
+ parent = node->in_parent;
+ } else {
+ if (node_is_black_or_0(tmp->in_left)) {
+ struct interval_node *o_right;
+
+ o_right = tmp->in_right;
+ if (o_right)
+ o_right->in_color = INTERVAL_BLACK;
+ tmp->in_color = INTERVAL_RED;
+ __rotate_left(tmp, root);
+ tmp = parent->in_left;
+ }
+ tmp->in_color = parent->in_color;
+ parent->in_color = INTERVAL_BLACK;
+ if (tmp->in_left)
+ tmp->in_left->in_color = INTERVAL_BLACK;
+ __rotate_right(parent, root);
+ node = *root;
+ break;
+ }
+ }
+ }
+ if (node)
+ node->in_color = INTERVAL_BLACK;
+}
+
+/*
+ * If the @max_high value of @node has changed, this function traverses a path
+ * from the node up to the root to update max_high for the whole tree.
+ */
+static void update_maxhigh(struct interval_node *node,
+ __u64 old_maxhigh)
+{
+ __u64 left_max, right_max;
+
+ while (node) {
+ left_max = node->in_left ? node->in_left->in_max_high : 0;
+ right_max = node->in_right ? node->in_right->in_max_high : 0;
+ node->in_max_high = max_u64(interval_high(node),
+ max_u64(left_max, right_max));
+
+ if (node->in_max_high >= old_maxhigh)
+ break;
+ node = node->in_parent;
+ }
+}
+
+void interval_erase(struct interval_node *node,
+ struct interval_node **root)
+{
+ struct interval_node *child, *parent;
+ int color;
+
+ LASSERT(interval_is_intree(node));
+ node->in_intree = 0;
+ if (!node->in_left) {
+ child = node->in_right;
+ } else if (!node->in_right) {
+ child = node->in_left;
+ } else { /* Both left and right child are not NULL */
+ struct interval_node *old = node;
+
+ node = interval_next(node);
+ child = node->in_right;
+ parent = node->in_parent;
+ color = node->in_color;
+
+ if (child)
+ child->in_parent = parent;
+ if (parent == old)
+ parent->in_right = child;
+ else
+ parent->in_left = child;
+
+ node->in_color = old->in_color;
+ node->in_right = old->in_right;
+ node->in_left = old->in_left;
+ node->in_parent = old->in_parent;
+
+ if (old->in_parent) {
+ if (node_is_left_child(old))
+ old->in_parent->in_left = node;
+ else
+ old->in_parent->in_right = node;
+ } else {
+ *root = node;
+ }
+
+ old->in_left->in_parent = node;
+ if (old->in_right)
+ old->in_right->in_parent = node;
+ update_maxhigh(child ? : parent, node->in_max_high);
+ update_maxhigh(node, old->in_max_high);
+ if (parent == old)
+ parent = node;
+ goto color;
+ }
+ parent = node->in_parent;
+ color = node->in_color;
+
+ if (child)
+ child->in_parent = parent;
+ if (parent) {
+ if (node_is_left_child(node))
+ parent->in_left = child;
+ else
+ parent->in_right = child;
+ } else {
+ *root = child;
+ }
+
+ update_maxhigh(child ? : parent, node->in_max_high);
+
+color:
+ if (color == INTERVAL_BLACK)
+ interval_erase_color(child, parent, root);
+}
+EXPORT_SYMBOL(interval_erase);
+
+static inline int interval_may_overlap(struct interval_node *node,
+ struct interval_node_extent *ext)
+{
+ return (ext->start <= node->in_max_high &&
+ ext->end >= interval_low(node));
+}
+
+/*
+ * This function finds all intervals that overlap the interval ext,
+ * and calls func to handle the resulting intervals one by one.
+ * In Lustre, this function finds all conflicting locks in
+ * the granted queue and adds these locks to the AST work list.
+ *
+ * {
+ * if (node == NULL)
+ * return 0;
+ * if (ext->end < interval_low(node)) {
+ * interval_search(node->in_left, ext, func, data);
+ * } else if (interval_may_overlap(node, ext)) {
+ * if (extent_overlapped(ext, &node->in_extent))
+ * func(node, data);
+ * interval_search(node->in_left, ext, func, data);
+ * interval_search(node->in_right, ext, func, data);
+ * }
+ * return 0;
+ * }
+ *
+ */
+enum interval_iter interval_search(struct interval_node *node,
+ struct interval_node_extent *ext,
+ interval_callback_t func,
+ void *data)
+{
+ struct interval_node *parent;
+ enum interval_iter rc = INTERVAL_ITER_CONT;
+
+ LASSERT(ext != NULL);
+ LASSERT(func != NULL);
+
+ while (node) {
+ if (ext->end < interval_low(node)) {
+ if (node->in_left) {
+ node = node->in_left;
+ continue;
+ }
+ } else if (interval_may_overlap(node, ext)) {
+ if (extent_overlapped(ext, &node->in_extent)) {
+ rc = func(node, data);
+ if (rc == INTERVAL_ITER_STOP)
+ break;
+ }
+
+ if (node->in_left) {
+ node = node->in_left;
+ continue;
+ }
+ if (node->in_right) {
+ node = node->in_right;
+ continue;
+ }
+ }
+
+ parent = node->in_parent;
+ while (parent) {
+ if (node_is_left_child(node) &&
+ parent->in_right) {
+ /* If we ever went left, it means that the
+ * parent satisfied ext->end < interval_low(parent)
+ * or may_overlap(parent). If the former is true,
+ * we need not go back up. So stop early and check
+ * may_overlap(parent) after this loop. */
+ node = parent->in_right;
+ break;
+ }
+ node = parent;
+ parent = parent->in_parent;
+ }
+ if (parent == NULL || !interval_may_overlap(parent, ext))
+ break;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(interval_search);
+
+static enum interval_iter interval_overlap_cb(struct interval_node *n,
+ void *args)
+{
+ *(int *)args = 1;
+ return INTERVAL_ITER_STOP;
+}
+
+int interval_is_overlapped(struct interval_node *root,
+ struct interval_node_extent *ext)
+{
+ int has = 0;
+ (void)interval_search(root, ext, interval_overlap_cb, &has);
+ return has;
+}
+EXPORT_SYMBOL(interval_is_overlapped);
+
+/* Don't expand the low end. Expanding downwards is expensive, and largely
+ * pointless, because programs seldom do IO backwards.
+ *
+ * The recursive algorithm of expanding low:
+ * expand_low {
+ * struct interval_node *tmp;
+ * static __u64 res = 0;
+ *
+ * if (root == NULL)
+ * return res;
+ * if (root->in_max_high < low) {
+ * res = max_u64(root->in_max_high + 1, res);
+ * return res;
+ * } else if (low < interval_low(root)) {
+ * interval_expand_low(root->in_left, low);
+ * return res;
+ * }
+ *
+ * if (interval_high(root) < low)
+ * res = max_u64(interval_high(root) + 1, res);
+ * interval_expand_low(root->in_left, low);
+ * interval_expand_low(root->in_right, low);
+ *
+ * return res;
+ * }
+ *
+ * It is easy to eliminate the recursion; see interval_search for
+ * an example. -jay
+ */
+static inline __u64 interval_expand_low(struct interval_node *root, __u64 low)
+{
+ /* we are only concerned with the empty tree right now. */
+ if (root == NULL)
+ return 0;
+ return low;
+}
+
+static inline __u64 interval_expand_high(struct interval_node *node, __u64 high)
+{
+ __u64 result = ~0;
+
+ while (node != NULL) {
+ if (node->in_max_high < high)
+ break;
+
+ if (interval_low(node) > high) {
+ result = interval_low(node) - 1;
+ node = node->in_left;
+ } else {
+ node = node->in_right;
+ }
+ }
+
+ return result;
+}
+
+/* expanding the extent based on @ext. */
+void interval_expand(struct interval_node *root,
+ struct interval_node_extent *ext,
+ struct interval_node_extent *limiter)
+{
+ /* The interval_is_overlapped() assertion is expensive because we may
+ * traverse many nodes to find an overlapping node. */
+ LASSERT(interval_is_overlapped(root, ext) == 0);
+ if (!limiter || limiter->start < ext->start)
+ ext->start = interval_expand_low(root, ext->start);
+ if (!limiter || limiter->end > ext->end)
+ ext->end = interval_expand_high(root, ext->end);
+ LASSERT(interval_is_overlapped(root, ext) == 0);
+}
+EXPORT_SYMBOL(interval_expand);
diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c
new file mode 100644
index 000000000..cd8ab40e3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_lib.h"
+
+/**
+ * Lock a lock and its resource.
+ *
+ * LDLM locking uses the resource to serialize access to locks,
+ * but there is a case when we change the resource of a lock upon
+ * an enqueue reply. We rely on lock->l_resource = new_res
+ * being an atomic operation.
+ */
+struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock)
+{
+ /* on server-side resource of lock doesn't change */
+ if ((lock->l_flags & LDLM_FL_NS_SRV) == 0)
+ spin_lock(&lock->l_lock);
+
+ lock_res(lock->l_resource);
+
+ lock->l_flags |= LDLM_FL_RES_LOCKED;
+ return lock->l_resource;
+}
+EXPORT_SYMBOL(lock_res_and_lock);
+
+/**
+ * Unlock a lock and its resource previously locked with lock_res_and_lock
+ */
+void unlock_res_and_lock(struct ldlm_lock *lock)
+{
+ /* on server-side resource of lock doesn't change */
+ lock->l_flags &= ~LDLM_FL_RES_LOCKED;
+
+ unlock_res(lock->l_resource);
+ if ((lock->l_flags & LDLM_FL_NS_SRV) == 0)
+ spin_unlock(&lock->l_lock);
+}
+EXPORT_SYMBOL(unlock_res_and_lock);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
new file mode 100644
index 000000000..fd9b05936
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
@@ -0,0 +1,241 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_extent.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of EXTENT lock type
+ *
+ * EXTENT lock type is for locking a contiguous range of values, represented
+ * by 64-bit starting and ending offsets (inclusive). There are several extent
+ * lock modes, some of which may be mutually incompatible. Extent locks are
+ * considered incompatible if their modes are incompatible and their extents
+ * intersect. See the lock mode compatibility matrix in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/lustre_dlm.h"
+#include "../include/obd_support.h"
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_lib.h"
+#include "ldlm_internal.h"
+
+
+/* When a lock is cancelled by a client, the KMS may undergo change if this
+ * is the "highest lock". This function returns the new KMS value.
+ * Caller must hold lr_lock already.
+ *
+ * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
+{
+ struct ldlm_resource *res = lock->l_resource;
+ struct list_head *tmp;
+ struct ldlm_lock *lck;
+ __u64 kms = 0;
+
+ /* don't let another thread in ldlm_extent_shift_kms race in
+ * just after we finish and take our lock into account in its
+ * calculation of the kms */
+ lock->l_flags |= LDLM_FL_KMS_IGNORE;
+
+ list_for_each(tmp, &res->lr_granted) {
+ lck = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (lck->l_flags & LDLM_FL_KMS_IGNORE)
+ continue;
+
+ if (lck->l_policy_data.l_extent.end >= old_kms)
+ return old_kms;
+
+ /* This extent _has_ to be smaller than old_kms (checked above)
+ * so kms can only ever be smaller or the same as old_kms. */
+ if (lck->l_policy_data.l_extent.end + 1 > kms)
+ kms = lck->l_policy_data.l_extent.end + 1;
+ }
+ LASSERTF(kms <= old_kms, "kms %llu old_kms %llu\n", kms, old_kms);
+
+ return kms;
+}
+EXPORT_SYMBOL(ldlm_extent_shift_kms);
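To make the "y + 1" note above concrete with made-up numbers: if the surviving granted extents are [0, 4095] and [8192, 16383], and the cancelled lock was the highest one with old_kms = 32768, the loop never sees an extent ending at or beyond old_kms, so it returns 16383 + 1 = 16384, i.e. the end of the highest remaining extent plus one.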
+
+struct kmem_cache *ldlm_interval_slab;
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+ struct ldlm_interval *node;
+
+ LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+ OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS);
+ if (node == NULL)
+ return NULL;
+
+ INIT_LIST_HEAD(&node->li_group);
+ ldlm_interval_attach(node, lock);
+ return node;
+}
+
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+ if (node) {
+ LASSERT(list_empty(&node->li_group));
+ LASSERT(!interval_is_intree(&node->li_node));
+ OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+ }
+}
+
+/* interval tree, for LDLM_EXTENT. */
+void ldlm_interval_attach(struct ldlm_interval *n,
+ struct ldlm_lock *l)
+{
+ LASSERT(l->l_tree_node == NULL);
+ LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+
+ list_add_tail(&l->l_sl_policy, &n->li_group);
+ l->l_tree_node = n;
+}
+
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+ struct ldlm_interval *n = l->l_tree_node;
+
+ if (n == NULL)
+ return NULL;
+
+ LASSERT(!list_empty(&n->li_group));
+ l->l_tree_node = NULL;
+ list_del_init(&l->l_sl_policy);
+
+ return list_empty(&n->li_group) ? n : NULL;
+}
+
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+ int index;
+
+ LASSERT(mode != 0);
+ LASSERT(IS_PO2(mode));
+ for (index = -1; mode; index++)
+ mode >>= 1;
+ LASSERT(index < LCK_MODE_NUM);
+ return index;
+}
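For illustration, lock_mode_to_index() simply returns the bit position of a power-of-two mode value; the concrete mode below is arbitrary:

    /* a mode of 1 << 3 shifts down to zero after four iterations, yielding
     * index 3; which LCK_* name that bit carries does not matter here */
    LASSERT(lock_mode_to_index(1 << 3) == 3);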
+
+/** Add newly granted lock into interval tree for the resource. */
+void ldlm_extent_add_lock(struct ldlm_resource *res,
+ struct ldlm_lock *lock)
+{
+ struct interval_node *found, **root;
+ struct ldlm_interval *node;
+ struct ldlm_extent *extent;
+ int idx;
+
+ LASSERT(lock->l_granted_mode == lock->l_req_mode);
+
+ node = lock->l_tree_node;
+ LASSERT(node != NULL);
+ LASSERT(!interval_is_intree(&node->li_node));
+
+ idx = lock_mode_to_index(lock->l_granted_mode);
+ LASSERT(lock->l_granted_mode == 1 << idx);
+ LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);
+
+ /* node extent initialize */
+ extent = &lock->l_policy_data.l_extent;
+ interval_set(&node->li_node, extent->start, extent->end);
+
+ root = &res->lr_itree[idx].lit_root;
+ found = interval_insert(&node->li_node, root);
+ if (found) { /* The policy group found. */
+ struct ldlm_interval *tmp;
+
+ tmp = ldlm_interval_detach(lock);
+ LASSERT(tmp != NULL);
+ ldlm_interval_free(tmp);
+ ldlm_interval_attach(to_ldlm_interval(found), lock);
+ }
+ res->lr_itree[idx].lit_size++;
+
+ /* even though we use an interval tree to manage the extent locks, we
+ * also add the locks to the granted list, for debugging purposes */
+ ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+/** Remove cancelled lock from resource interval tree. */
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+ struct ldlm_resource *res = lock->l_resource;
+ struct ldlm_interval *node = lock->l_tree_node;
+ struct ldlm_interval_tree *tree;
+ int idx;
+
+ if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */
+ return;
+
+ idx = lock_mode_to_index(lock->l_granted_mode);
+ LASSERT(lock->l_granted_mode == 1 << idx);
+ tree = &res->lr_itree[idx];
+
+ LASSERT(tree->lit_root != NULL); /* ensure the tree is not null */
+
+ tree->lit_size--;
+ node = ldlm_interval_detach(lock);
+ if (node) {
+ interval_erase(&node->li_node, &tree->lit_root);
+ ldlm_interval_free(node);
+ }
+}
+
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy)
+{
+ memset(lpolicy, 0, sizeof(*lpolicy));
+ lpolicy->l_extent.start = wpolicy->l_extent.start;
+ lpolicy->l_extent.end = wpolicy->l_extent.end;
+ lpolicy->l_extent.gid = wpolicy->l_extent.gid;
+}
+
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy)
+{
+ memset(wpolicy, 0, sizeof(*wpolicy));
+ wpolicy->l_extent.start = lpolicy->l_extent.start;
+ wpolicy->l_extent.end = lpolicy->l_extent.end;
+ wpolicy->l_extent.gid = lpolicy->l_extent.gid;
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
new file mode 100644
index 000000000..a4c252feb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
@@ -0,0 +1,859 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003 Hewlett-Packard Development Company LP.
+ * Developed under the sponsorship of the US Government under
+ * Subcontract No. B514193
+ *
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file implements POSIX lock type for Lustre.
+ * Its policy properties are start and end of extent and PID.
+ *
+ * These locks are only done through the MDS due to POSIX semantics requiring,
+ * e.g., that a lock may be only partially released and as such split into
+ * two parts, and also that two adjacent locks from the same process may be
+ * merged into a single wider lock.
+ *
+ * Lock modes are mapped like this:
+ * PR and PW for READ and WRITE locks
+ * NL to request a releasing of a portion of the lock
+ *
+ * These flock locks never timeout.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../include/lustre_dlm.h"
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_lib.h"
+#include <linux/list.h>
+#include "ldlm_internal.h"
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag);
+
+/**
+ * list_for_remaining_safe - iterate over the remaining entries in a list
+ * and safeguard against removal of a list entry.
+ * \param pos the &struct list_head to use as a loop counter. pos MUST
+ * have been initialized prior to using it in this macro.
+ * \param n another &struct list_head to use as temporary storage
+ * \param head the head for your list.
+ */
+#define list_for_remaining_safe(pos, n, head) \
+ for (n = pos->next; pos != (head); pos = n, n = pos->next)
+
+static inline int
+ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+ return((new->l_policy_data.l_flock.owner ==
+ lock->l_policy_data.l_flock.owner) &&
+ (new->l_export == lock->l_export));
+}
+
+static inline int
+ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+ return((new->l_policy_data.l_flock.start <=
+ lock->l_policy_data.l_flock.end) &&
+ (new->l_policy_data.l_flock.end >=
+ lock->l_policy_data.l_flock.start));
+}
+
+static inline void ldlm_flock_blocking_link(struct ldlm_lock *req,
+ struct ldlm_lock *lock)
+{
+ /* For server only */
+ if (req->l_export == NULL)
+ return;
+
+ LASSERT(hlist_unhashed(&req->l_exp_flock_hash));
+
+ req->l_policy_data.l_flock.blocking_owner =
+ lock->l_policy_data.l_flock.owner;
+ req->l_policy_data.l_flock.blocking_export =
+ lock->l_export;
+ req->l_policy_data.l_flock.blocking_refs = 0;
+
+ cfs_hash_add(req->l_export->exp_flock_hash,
+ &req->l_policy_data.l_flock.owner,
+ &req->l_exp_flock_hash);
+}
+
+static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req)
+{
+ /* For server only */
+ if (req->l_export == NULL)
+ return;
+
+ check_res_locked(req->l_resource);
+ if (req->l_export->exp_flock_hash != NULL &&
+ !hlist_unhashed(&req->l_exp_flock_hash))
+ cfs_hash_del(req->l_export->exp_flock_hash,
+ &req->l_policy_data.l_flock.owner,
+ &req->l_exp_flock_hash);
+}
+
+static inline void
+ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags)
+{
+ LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)",
+ mode, flags);
+
+ /* Safe to not lock here, since it should be empty anyway */
+ LASSERT(hlist_unhashed(&lock->l_exp_flock_hash));
+
+ list_del_init(&lock->l_res_link);
+ if (flags == LDLM_FL_WAIT_NOREPROC &&
+ !(lock->l_flags & LDLM_FL_FAILED)) {
+ /* client side - set a flag to prevent sending a CANCEL */
+ lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
+
+ /* when reaching here, it is under lock_res_and_lock(). Thus, we
+ need to call the nolock version of ldlm_lock_decref_internal */
+ ldlm_lock_decref_internal_nolock(lock, mode);
+ }
+
+ ldlm_lock_destroy_nolock(lock);
+}
+
+/**
+ * POSIX locks deadlock detection code.
+ *
+ * Given a new lock \a req and an existing lock \a bl_lock it conflicts
+ * with, we need to iterate through all blocked POSIX locks for this
+ * export and see if a deadlock condition arises (i.e. when one client
+ * holds a lock on something and wants a lock on something else, while
+ * at the same time another client has the opposite situation).
+ */
+static int
+ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock)
+{
+ struct obd_export *req_exp = req->l_export;
+ struct obd_export *bl_exp = bl_lock->l_export;
+ __u64 req_owner = req->l_policy_data.l_flock.owner;
+ __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner;
+
+ /* For server only */
+ if (req_exp == NULL)
+ return 0;
+
+ class_export_get(bl_exp);
+ while (1) {
+ struct obd_export *bl_exp_new;
+ struct ldlm_lock *lock = NULL;
+ struct ldlm_flock *flock;
+
+ if (bl_exp->exp_flock_hash != NULL)
+ lock = cfs_hash_lookup(bl_exp->exp_flock_hash,
+ &bl_owner);
+ if (lock == NULL)
+ break;
+
+ LASSERT(req != lock);
+ flock = &lock->l_policy_data.l_flock;
+ LASSERT(flock->owner == bl_owner);
+ bl_owner = flock->blocking_owner;
+ bl_exp_new = class_export_get(flock->blocking_export);
+ class_export_put(bl_exp);
+
+ cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash);
+ bl_exp = bl_exp_new;
+
+ if (bl_owner == req_owner && bl_exp == req_exp) {
+ class_export_put(bl_exp);
+ return 1;
+ }
+ }
+ class_export_put(bl_exp);
+
+ return 0;
+}
+
+static void ldlm_flock_cancel_on_deadlock(struct ldlm_lock *lock,
+ struct list_head *work_list)
+{
+ CDEBUG(D_INFO, "reprocess deadlock req=%p\n", lock);
+
+ if ((exp_connect_flags(lock->l_export) &
+ OBD_CONNECT_FLOCK_DEAD) == 0) {
+ CERROR(
+ "deadlock found, but client doesn't support flock canceliation\n");
+ } else {
+ LASSERT(lock->l_completion_ast);
+ LASSERT((lock->l_flags & LDLM_FL_AST_SENT) == 0);
+ lock->l_flags |= LDLM_FL_AST_SENT | LDLM_FL_CANCEL_ON_BLOCK |
+ LDLM_FL_FLOCK_DEADLOCK;
+ ldlm_flock_blocking_unlink(lock);
+ ldlm_resource_unlink_lock(lock);
+ ldlm_add_ast_work_item(lock, NULL, work_list);
+ }
+}
+
+/**
+ * Process a granting attempt for flock lock.
+ * Must be called under ns lock held.
+ *
+ * This function looks for any conflicts for \a lock in the granted or
+ * waiting queues. The lock is granted if no conflicts are found in
+ * either queue.
+ *
+ * It is also responsible for splitting a lock if a portion of the lock
+ * is released.
+ *
+ * If \a first_enq is 0 (ie, called from ldlm_reprocess_queue):
+ * - blocking ASTs have already been sent
+ *
+ * If \a first_enq is 1 (ie, called from ldlm_lock_enqueue):
+ * - blocking ASTs have not been sent yet, so list of conflicting locks
+ * would be collected and ASTs sent.
+ */
+int
+ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq,
+ ldlm_error_t *err, struct list_head *work_list)
+{
+ struct ldlm_resource *res = req->l_resource;
+ struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+ struct list_head *tmp;
+ struct list_head *ownlocks = NULL;
+ struct ldlm_lock *lock = NULL;
+ struct ldlm_lock *new = req;
+ struct ldlm_lock *new2 = NULL;
+ ldlm_mode_t mode = req->l_req_mode;
+ int local = ns_is_client(ns);
+ int added = (mode == LCK_NL);
+ int overlaps = 0;
+ int splitted = 0;
+ const struct ldlm_callback_suite null_cbs = { NULL };
+
+ CDEBUG(D_DLMTRACE,
+ "flags %#llx owner %llu pid %u mode %u start %llu end %llu\n",
+ *flags, new->l_policy_data.l_flock.owner,
+ new->l_policy_data.l_flock.pid, mode,
+ req->l_policy_data.l_flock.start,
+ req->l_policy_data.l_flock.end);
+
+ *err = ELDLM_OK;
+
+ if (local) {
+ /* No blocking ASTs are sent to the clients for
+ * Posix file & record locks */
+ req->l_blocking_ast = NULL;
+ } else {
+ /* Called on the server for lock cancels. */
+ req->l_blocking_ast = ldlm_flock_blocking_ast;
+ }
+
+reprocess:
+ if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) {
+ /* This loop determines where this process's locks start
+ * in the resource lr_granted list. */
+ list_for_each(tmp, &res->lr_granted) {
+ lock = list_entry(tmp, struct ldlm_lock,
+ l_res_link);
+ if (ldlm_same_flock_owner(lock, req)) {
+ ownlocks = tmp;
+ break;
+ }
+ }
+ } else {
+ int reprocess_failed = 0;
+
+ lockmode_verify(mode);
+
+ /* This loop determines if there are existing locks
+ * that conflict with the new lock request. */
+ list_for_each(tmp, &res->lr_granted) {
+ lock = list_entry(tmp, struct ldlm_lock,
+ l_res_link);
+
+ if (ldlm_same_flock_owner(lock, req)) {
+ if (!ownlocks)
+ ownlocks = tmp;
+ continue;
+ }
+
+ /* locks are compatible, overlap doesn't matter */
+ if (lockmode_compat(lock->l_granted_mode, mode))
+ continue;
+
+ if (!ldlm_flocks_overlap(lock, req))
+ continue;
+
+ if (!first_enq) {
+ reprocess_failed = 1;
+ if (ldlm_flock_deadlock(req, lock)) {
+ ldlm_flock_cancel_on_deadlock(req,
+ work_list);
+ return LDLM_ITER_CONTINUE;
+ }
+ continue;
+ }
+
+ if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+ ldlm_flock_destroy(req, mode, *flags);
+ *err = -EAGAIN;
+ return LDLM_ITER_STOP;
+ }
+
+ if (*flags & LDLM_FL_TEST_LOCK) {
+ ldlm_flock_destroy(req, mode, *flags);
+ req->l_req_mode = lock->l_granted_mode;
+ req->l_policy_data.l_flock.pid =
+ lock->l_policy_data.l_flock.pid;
+ req->l_policy_data.l_flock.start =
+ lock->l_policy_data.l_flock.start;
+ req->l_policy_data.l_flock.end =
+ lock->l_policy_data.l_flock.end;
+ *flags |= LDLM_FL_LOCK_CHANGED;
+ return LDLM_ITER_STOP;
+ }
+
+ /* add lock to blocking list before deadlock
+ * check to prevent race */
+ ldlm_flock_blocking_link(req, lock);
+
+ if (ldlm_flock_deadlock(req, lock)) {
+ ldlm_flock_blocking_unlink(req);
+ ldlm_flock_destroy(req, mode, *flags);
+ *err = -EDEADLK;
+ return LDLM_ITER_STOP;
+ }
+
+ ldlm_resource_add_lock(res, &res->lr_waiting, req);
+ *flags |= LDLM_FL_BLOCK_GRANTED;
+ return LDLM_ITER_STOP;
+ }
+ if (reprocess_failed)
+ return LDLM_ITER_CONTINUE;
+ }
+
+ if (*flags & LDLM_FL_TEST_LOCK) {
+ ldlm_flock_destroy(req, mode, *flags);
+ req->l_req_mode = LCK_NL;
+ *flags |= LDLM_FL_LOCK_CHANGED;
+ return LDLM_ITER_STOP;
+ }
+
+ /* In case we had slept on this lock request take it off of the
+ * deadlock detection hash list. */
+ ldlm_flock_blocking_unlink(req);
+
+ /* Scan the locks owned by this process that overlap this request.
+ * We may have to merge or split existing locks. */
+
+ if (!ownlocks)
+ ownlocks = &res->lr_granted;
+
+ list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+ lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+
+ if (!ldlm_same_flock_owner(lock, new))
+ break;
+
+ if (lock->l_granted_mode == mode) {
+ /* If the modes are the same then we need to process
+ * locks that overlap OR adjoin the new lock. The extra
+ * logic condition is necessary to deal with arithmetic
+ * overflow and underflow. */
+ if ((new->l_policy_data.l_flock.start >
+ (lock->l_policy_data.l_flock.end + 1))
+ && (lock->l_policy_data.l_flock.end !=
+ OBD_OBJECT_EOF))
+ continue;
+
+ if ((new->l_policy_data.l_flock.end <
+ (lock->l_policy_data.l_flock.start - 1))
+ && (lock->l_policy_data.l_flock.start != 0))
+ break;
+
+ if (new->l_policy_data.l_flock.start <
+ lock->l_policy_data.l_flock.start) {
+ lock->l_policy_data.l_flock.start =
+ new->l_policy_data.l_flock.start;
+ } else {
+ new->l_policy_data.l_flock.start =
+ lock->l_policy_data.l_flock.start;
+ }
+
+ if (new->l_policy_data.l_flock.end >
+ lock->l_policy_data.l_flock.end) {
+ lock->l_policy_data.l_flock.end =
+ new->l_policy_data.l_flock.end;
+ } else {
+ new->l_policy_data.l_flock.end =
+ lock->l_policy_data.l_flock.end;
+ }
+
+ if (added) {
+ ldlm_flock_destroy(lock, mode, *flags);
+ } else {
+ new = lock;
+ added = 1;
+ }
+ continue;
+ }
+
+ if (new->l_policy_data.l_flock.start >
+ lock->l_policy_data.l_flock.end)
+ continue;
+
+ if (new->l_policy_data.l_flock.end <
+ lock->l_policy_data.l_flock.start)
+ break;
+
+ ++overlaps;
+
+ if (new->l_policy_data.l_flock.start <=
+ lock->l_policy_data.l_flock.start) {
+ if (new->l_policy_data.l_flock.end <
+ lock->l_policy_data.l_flock.end) {
+ lock->l_policy_data.l_flock.start =
+ new->l_policy_data.l_flock.end + 1;
+ break;
+ }
+ ldlm_flock_destroy(lock, lock->l_req_mode, *flags);
+ continue;
+ }
+ if (new->l_policy_data.l_flock.end >=
+ lock->l_policy_data.l_flock.end) {
+ lock->l_policy_data.l_flock.end =
+ new->l_policy_data.l_flock.start - 1;
+ continue;
+ }
+
+ /* split the existing lock into two locks */
+
+ /* if this is an F_UNLCK operation then we could avoid
+ * allocating a new lock and use the req lock passed in
+ * with the request but this would complicate the reply
+ * processing since updates to req get reflected in the
+ * reply. The client side replays the lock request so
+ * it must see the original lock data in the reply. */
+
+ /* XXX - if ldlm_lock_new() can sleep we should
+ * release the lr_lock, allocate the new lock,
+ * and restart processing this lock. */
+ if (!new2) {
+ unlock_res_and_lock(req);
+ new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK,
+ lock->l_granted_mode, &null_cbs,
+ NULL, 0, LVB_T_NONE);
+ lock_res_and_lock(req);
+ if (!new2) {
+ ldlm_flock_destroy(req, lock->l_granted_mode,
+ *flags);
+ *err = -ENOLCK;
+ return LDLM_ITER_STOP;
+ }
+ goto reprocess;
+ }
+
+ splitted = 1;
+
+ new2->l_granted_mode = lock->l_granted_mode;
+ new2->l_policy_data.l_flock.pid =
+ new->l_policy_data.l_flock.pid;
+ new2->l_policy_data.l_flock.owner =
+ new->l_policy_data.l_flock.owner;
+ new2->l_policy_data.l_flock.start =
+ lock->l_policy_data.l_flock.start;
+ new2->l_policy_data.l_flock.end =
+ new->l_policy_data.l_flock.start - 1;
+ lock->l_policy_data.l_flock.start =
+ new->l_policy_data.l_flock.end + 1;
+ new2->l_conn_export = lock->l_conn_export;
+ if (lock->l_export != NULL) {
+ new2->l_export = class_export_lock_get(lock->l_export,
+ new2);
+ if (new2->l_export->exp_lock_hash &&
+ hlist_unhashed(&new2->l_exp_hash))
+ cfs_hash_add(new2->l_export->exp_lock_hash,
+ &new2->l_remote_handle,
+ &new2->l_exp_hash);
+ }
+ if (*flags == LDLM_FL_WAIT_NOREPROC)
+ ldlm_lock_addref_internal_nolock(new2,
+ lock->l_granted_mode);
+
+ /* insert new2 at lock */
+ ldlm_resource_add_lock(res, ownlocks, new2);
+ LDLM_LOCK_RELEASE(new2);
+ break;
+ }
+
+ /* if new2 was created but never used, destroy it */
+ if (splitted == 0 && new2 != NULL)
+ ldlm_lock_destroy_nolock(new2);
+
+ /* At this point we're granting the lock request. */
+ req->l_granted_mode = req->l_req_mode;
+
+ /* Add req to the granted queue before calling ldlm_reprocess_all(). */
+ if (!added) {
+ list_del_init(&req->l_res_link);
+ /* insert new lock before ownlocks in list. */
+ ldlm_resource_add_lock(res, ownlocks, req);
+ }
+
+ if (*flags != LDLM_FL_WAIT_NOREPROC) {
+ /* The only possible case in which the client side calls this
+ * flock policy function is from ldlm_flock_completion_ast(),
+ * which carries the LDLM_FL_WAIT_NOREPROC flag. */
+ CERROR("Illegal parameter for client-side-only module.\n");
+ LBUG();
+ }
+
+ /* In case we're reprocessing the requested lock we can't destroy
+ * it until after calling ldlm_add_ast_work_item() above so that laawi()
+ * can bump the reference count on \a req. Otherwise \a req
+ * could be freed before the completion AST can be sent. */
+ if (added)
+ ldlm_flock_destroy(req, mode, *flags);
+
+ ldlm_resource_dump(D_INFO, res);
+ return LDLM_ITER_CONTINUE;
+}
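+
+/*
+ * An illustrative userspace sketch of how the split path above gets
+ * exercised; the file descriptor and byte ranges are hypothetical and are
+ * shown only to make the start/end arithmetic concrete.
+ *
+ *	struct flock fl = {
+ *		.l_type   = F_WRLCK,
+ *		.l_whence = SEEK_SET,
+ *		.l_start  = 0,
+ *		.l_len    = 100,	// write-lock bytes [0, 99]
+ *	};
+ *	fcntl(fd, F_SETLKW, &fl);	// one granted flock lock: [0, 99]
+ *
+ *	fl.l_type  = F_UNLCK;
+ *	fl.l_start = 40;
+ *	fl.l_len   = 20;		// unlock bytes [40, 59]
+ *	fcntl(fd, F_SETLK, &fl);	// the existing lock is split: new2
+ *					// becomes [0, 39] and the original
+ *					// lock is trimmed to [60, 99]
+ */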
+
+struct ldlm_flock_wait_data {
+ struct ldlm_lock *fwd_lock;
+ int fwd_generation;
+};
+
+static void
+ldlm_flock_interrupted_wait(void *data)
+{
+ struct ldlm_lock *lock;
+
+ lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock;
+
+ /* take lock off the deadlock detection hash list. */
+ lock_res_and_lock(lock);
+ ldlm_flock_blocking_unlink(lock);
+
+ /* client side - set flag to prevent lock from being put on LRU list */
+ lock->l_flags |= LDLM_FL_CBPENDING;
+ unlock_res_and_lock(lock);
+}
+
+/**
+ * Flock completion callback function.
+ *
+ * \param lock [in,out]: A lock to be handled
+ * \param flags [in]: LDLM_FL_* flags describing the state of the enqueue
+ * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg
+ *
+ * \retval 0 : success
+ * \retval <0 : failure
+ */
+int
+ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+ struct file_lock *getlk = lock->l_ast_data;
+ struct obd_device *obd;
+ struct obd_import *imp = NULL;
+ struct ldlm_flock_wait_data fwd;
+ struct l_wait_info lwi;
+ ldlm_error_t err;
+ int rc = 0;
+
+ CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n",
+ flags, data, getlk);
+
+ /* Import invalidation. We need to actually release the lock
+ * references being held, so that it can go away. There is no point
+ * in holding the lock even if the app still believes it has it,
+ * since the server already dropped it anyway. This only applies to
+ * granted locks. */
+ if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
+ (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
+ if (lock->l_req_mode == lock->l_granted_mode &&
+ lock->l_granted_mode != LCK_NL &&
+ NULL == data)
+ ldlm_lock_decref_internal(lock, lock->l_req_mode);
+
+ /* Need to wake up the waiter if we were evicted */
+ wake_up(&lock->l_waitq);
+ return 0;
+ }
+
+ LASSERT(flags != LDLM_FL_WAIT_NOREPROC);
+
+ if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+ LDLM_FL_BLOCK_CONV))) {
+ if (NULL == data)
+ /* mds granted the lock in the reply */
+ goto granted;
+ /* CP AST RPC: lock get granted, wake it up */
+ wake_up(&lock->l_waitq);
+ return 0;
+ }
+
+ LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping");
+ fwd.fwd_lock = lock;
+ obd = class_exp2obd(lock->l_conn_export);
+
+ /* if this is a local lock, there is no import */
+ if (NULL != obd)
+ imp = obd->u.cli.cl_import;
+
+ if (NULL != imp) {
+ spin_lock(&imp->imp_lock);
+ fwd.fwd_generation = imp->imp_generation;
+ spin_unlock(&imp->imp_lock);
+ }
+
+ lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd);
+
+ /* Go to sleep until the lock is granted. */
+ rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi);
+
+ if (rc) {
+ LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+ rc);
+ return rc;
+ }
+
+granted:
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
+
+ if (lock->l_flags & LDLM_FL_DESTROYED) {
+ LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
+ return 0;
+ }
+
+ if (lock->l_flags & LDLM_FL_FAILED) {
+ LDLM_DEBUG(lock, "client-side enqueue waking up: failed");
+ return -EIO;
+ }
+
+ if (rc) {
+ LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+ rc);
+ return rc;
+ }
+
+ LDLM_DEBUG(lock, "client-side enqueue granted");
+
+ lock_res_and_lock(lock);
+
+ /* take lock off the deadlock detection hash list. */
+ ldlm_flock_blocking_unlink(lock);
+
+ /* ldlm_lock_enqueue() has already placed lock on the granted list. */
+ list_del_init(&lock->l_res_link);
+
+ if (lock->l_flags & LDLM_FL_FLOCK_DEADLOCK) {
+ LDLM_DEBUG(lock, "client-side enqueue deadlock received");
+ rc = -EDEADLK;
+ } else if (flags & LDLM_FL_TEST_LOCK) {
+ /* fcntl(F_GETLK) request */
+ /* The old mode was saved in getlk->fl_type so that if the mode
+ * in the lock changes we can decref the appropriate refcount. */
+ ldlm_flock_destroy(lock, getlk->fl_type, LDLM_FL_WAIT_NOREPROC);
+ switch (lock->l_granted_mode) {
+ case LCK_PR:
+ getlk->fl_type = F_RDLCK;
+ break;
+ case LCK_PW:
+ getlk->fl_type = F_WRLCK;
+ break;
+ default:
+ getlk->fl_type = F_UNLCK;
+ }
+ getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid;
+ getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start;
+ getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end;
+ } else {
+ __u64 noreproc = LDLM_FL_WAIT_NOREPROC;
+
+ /* We need to reprocess the lock to do merges or splits
+ * with existing locks owned by this process. */
+ ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL);
+ }
+ unlock_res_and_lock(lock);
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_flock_completion_ast);
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag)
+{
+ LASSERT(lock);
+ LASSERT(flag == LDLM_CB_CANCELING);
+
+ /* take lock off the deadlock detection hash list. */
+ lock_res_and_lock(lock);
+ ldlm_flock_blocking_unlink(lock);
+ unlock_res_and_lock(lock);
+ return 0;
+}
+
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy)
+{
+ memset(lpolicy, 0, sizeof(*lpolicy));
+ lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+ lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+ lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+ /* Compatibility code: old clients had no notion of the owner field
+ * and relied solely on the pid for ownership. The owner field was
+ * introduced in LU-104 (Lustre 2.1, April 2011). */
+ lpolicy->l_flock.owner = wpolicy->l_flock.lfw_pid;
+}
+
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy)
+{
+ memset(lpolicy, 0, sizeof(*lpolicy));
+ lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+ lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+ lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+ lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner;
+}
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy)
+{
+ memset(wpolicy, 0, sizeof(*wpolicy));
+ wpolicy->l_flock.lfw_start = lpolicy->l_flock.start;
+ wpolicy->l_flock.lfw_end = lpolicy->l_flock.end;
+ wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid;
+ wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner;
+}
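+
+/*
+ * A minimal sketch of the compatibility difference between the two
+ * wire-to-local converters above; the field values are hypothetical:
+ *
+ *	ldlm_wire_policy_data_t wpolicy = { .l_flock = {
+ *		.lfw_start = 0, .lfw_end = 99,
+ *		.lfw_pid = 1234, .lfw_owner = 42,
+ *	} };
+ *	ldlm_policy_data_t l18, l21;
+ *
+ *	ldlm_flock_policy_wire18_to_local(&wpolicy, &l18);
+ *	ldlm_flock_policy_wire21_to_local(&wpolicy, &l21);
+ *	// l18.l_flock.owner == 1234: pre-2.1 clients fall back to the pid
+ *	// l21.l_flock.owner == 42:   2.1+ clients carry a real owner field
+ */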
+
+/*
+ * Export handle<->flock hash operations.
+ */
+static unsigned
+ldlm_export_flock_hash(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_u64_hash(*(__u64 *)key, mask);
+}
+
+static void *
+ldlm_export_flock_key(struct hlist_node *hnode)
+{
+ struct ldlm_lock *lock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+ return &lock->l_policy_data.l_flock.owner;
+}
+
+static int
+ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode)
+{
+ return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64));
+}
+
+static void *
+ldlm_export_flock_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+}
+
+static void
+ldlm_export_flock_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ldlm_lock *lock;
+ struct ldlm_flock *flock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+ LDLM_LOCK_GET(lock);
+
+ flock = &lock->l_policy_data.l_flock;
+ LASSERT(flock->blocking_export != NULL);
+ class_export_get(flock->blocking_export);
+ flock->blocking_refs++;
+}
+
+static void
+ldlm_export_flock_put(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ldlm_lock *lock;
+ struct ldlm_flock *flock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+ LDLM_LOCK_RELEASE(lock);
+
+ flock = &lock->l_policy_data.l_flock;
+ LASSERT(flock->blocking_export != NULL);
+ class_export_put(flock->blocking_export);
+ if (--flock->blocking_refs == 0) {
+ flock->blocking_owner = 0;
+ flock->blocking_export = NULL;
+ }
+}
+
+static cfs_hash_ops_t ldlm_export_flock_ops = {
+ .hs_hash = ldlm_export_flock_hash,
+ .hs_key = ldlm_export_flock_key,
+ .hs_keycmp = ldlm_export_flock_keycmp,
+ .hs_object = ldlm_export_flock_object,
+ .hs_get = ldlm_export_flock_get,
+ .hs_put = ldlm_export_flock_put,
+ .hs_put_locked = ldlm_export_flock_put,
+};
+
+int ldlm_init_flock_export(struct obd_export *exp)
+{
+ if (strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0)
+ return 0;
+
+ exp->exp_flock_hash =
+ cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+ HASH_EXP_LOCK_CUR_BITS,
+ HASH_EXP_LOCK_MAX_BITS,
+ HASH_EXP_LOCK_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ &ldlm_export_flock_ops,
+ CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE);
+ if (!exp->exp_flock_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_init_flock_export);
+
+void ldlm_destroy_flock_export(struct obd_export *exp)
+{
+ if (exp->exp_flock_hash) {
+ cfs_hash_putref(exp->exp_flock_hash);
+ exp->exp_flock_hash = NULL;
+ }
+}
+EXPORT_SYMBOL(ldlm_destroy_flock_export);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
new file mode 100644
index 000000000..40d333850
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
@@ -0,0 +1,74 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains the implementation of the IBITS lock type.
+ *
+ * An IBITS lock carries a bit mask describing various properties of an
+ * object. The meaning of each bit is defined by the caller and is
+ * opaque to the LDLM code.
+ *
+ * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW)
+ * are considered conflicting. See the lock mode compatibility matrix
+ * in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../include/lustre_dlm.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_lib.h"
+#include "ldlm_internal.h"
+
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy)
+{
+ memset(lpolicy, 0, sizeof(*lpolicy));
+ lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits;
+}
+
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy)
+{
+ memset(wpolicy, 0, sizeof(*wpolicy));
+ wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits;
+}
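+
+/*
+ * A minimal sketch of the conflict rule from the header comment, assuming
+ * the usual MDS inode bits (MDS_INODELOCK_LOOKUP, MDS_INODELOCK_UPDATE) as
+ * the caller-defined bit values and the lockmode_compat() helper from
+ * lustre_dlm.h:
+ *
+ *	__u64 granted = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE; // LCK_PR
+ *	__u64 request = MDS_INODELOCK_UPDATE;                        // LCK_PW
+ *
+ *	if ((granted & request) != 0 && !lockmode_compat(LCK_PR, LCK_PW))
+ *		// the bit masks intersect and PR/PW conflict, so the PW
+ *		// request has to wait; with disjoint bit masks the two
+ *		// locks would be compatible regardless of mode
+ */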
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
new file mode 100644
index 000000000..70b909f55
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
@@ -0,0 +1,316 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define MAX_STRING_SIZE 128
+
+extern int ldlm_srv_namespace_nr;
+extern int ldlm_cli_namespace_nr;
+extern struct mutex ldlm_srv_namespace_lock;
+extern struct list_head ldlm_srv_namespace_list;
+extern struct mutex ldlm_cli_namespace_lock;
+extern struct list_head ldlm_cli_active_namespace_list;
+extern struct list_head ldlm_cli_inactive_namespace_list;
+
+static inline int ldlm_namespace_nr_read(ldlm_side_t client)
+{
+ return client == LDLM_NAMESPACE_SERVER ?
+ ldlm_srv_namespace_nr : ldlm_cli_namespace_nr;
+}
+
+static inline void ldlm_namespace_nr_inc(ldlm_side_t client)
+{
+ if (client == LDLM_NAMESPACE_SERVER)
+ ldlm_srv_namespace_nr++;
+ else
+ ldlm_cli_namespace_nr++;
+}
+
+static inline void ldlm_namespace_nr_dec(ldlm_side_t client)
+{
+ if (client == LDLM_NAMESPACE_SERVER)
+ ldlm_srv_namespace_nr--;
+ else
+ ldlm_cli_namespace_nr--;
+}
+
+static inline struct list_head *ldlm_namespace_list(ldlm_side_t client)
+{
+ return client == LDLM_NAMESPACE_SERVER ?
+ &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list;
+}
+
+static inline struct list_head *ldlm_namespace_inactive_list(ldlm_side_t client)
+{
+ return client == LDLM_NAMESPACE_SERVER ?
+ &ldlm_srv_namespace_list : &ldlm_cli_inactive_namespace_list;
+}
+
+static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client)
+{
+ return client == LDLM_NAMESPACE_SERVER ?
+ &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock;
+}
+
+/* ns_bref is the number of resources in this namespace */
+static inline int ldlm_ns_empty(struct ldlm_namespace *ns)
+{
+ return atomic_read(&ns->ns_bref) == 0;
+}
+
+void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *, ldlm_side_t);
+void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *,
+ ldlm_side_t);
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t);
+
+/* ldlm_request.c */
+/* Cancel lru flag, it indicates we cancel aged locks. */
+enum {
+ LDLM_CANCEL_AGED = 1 << 0, /* Cancel aged locks (non lru resize). */
+ LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
+ LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
+ LDLM_CANCEL_LRUR = 1 << 3, /* Cancel locks from lru resize. */
+ LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+ * sending nor waiting for any RPCs) */
+};
+
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+ ldlm_cancel_flags_t sync, int flags);
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns,
+ struct list_head *cancels, int count, int max,
+ ldlm_cancel_flags_t cancel_flags, int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
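+
+/*
+ * A hedged usage sketch of the cancel-lru flags above, combined into a bit
+ * mask for ldlm_cancel_lru_local(); the namespace, list head and limit are
+ * placeholders, and LCF_ASYNC is assumed to be the asynchronous
+ * ldlm_cancel_flags_t value from lustre_dlm.h:
+ *
+ *	LIST_HEAD(cancels);
+ *	int count;
+ *
+ *	// gather up to 32 aged locks from the LRU without blocking
+ *	count = ldlm_cancel_lru_local(ns, &cancels, 0, 32, LCF_ASYNC,
+ *				      LDLM_CANCEL_AGED | LDLM_CANCEL_NO_WAIT);
+ */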
+
+/* ldlm_resource.c */
+int ldlm_resource_putref_locked(struct ldlm_resource *res);
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+ struct ldlm_lock *new);
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+ struct obd_import *imp, int force);
+void ldlm_namespace_free_post(struct ldlm_namespace *ns);
+/* ldlm_lock.c */
+
+struct ldlm_cb_set_arg {
+ struct ptlrpc_request_set *set;
+ int type; /* LDLM_{CP,BL,GL}_CALLBACK */
+ atomic_t restart;
+ struct list_head *list;
+ union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */
+};
+
+enum ldlm_desc_ast_t {
+ LDLM_WORK_BL_AST,
+ LDLM_WORK_CP_AST,
+ LDLM_WORK_REVOKE_AST,
+ LDLM_WORK_GL_AST
+};
+
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list);
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+ enum req_location loc, void *data, int size);
+struct ldlm_lock *
+ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *,
+ ldlm_type_t type, ldlm_mode_t,
+ const struct ldlm_callback_suite *cbs,
+ void *data, __u32 lvb_len, enum lvb_type lvb_type);
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
+ void *cookie, __u64 *flags);
+void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+ struct list_head *work_list);
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+ enum ldlm_desc_ast_t ast_type);
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock);
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock);
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock);
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock);
+
+void ldlm_cancel_locks_for_export(struct obd_export *export);
+
+/* ldlm_lockd.c */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+ struct ldlm_lock *lock);
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns,
+ struct ldlm_lock_desc *ld,
+ struct list_head *cancels, int count,
+ ldlm_cancel_flags_t cancel_flags);
+
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+ struct ldlm_lock_desc *ld, struct ldlm_lock *lock);
+
+extern struct kmem_cache *ldlm_resource_slab;
+
+/* ldlm_lockd.c & ldlm_lock.c */
+extern struct kmem_cache *ldlm_lock_slab;
+
+/* ldlm_extent.c */
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock);
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock);
+
+/* ldlm_flock.c */
+int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags,
+ int first_enq, ldlm_error_t *err,
+ struct list_head *work_list);
+int ldlm_init_flock_export(struct obd_export *exp);
+void ldlm_destroy_flock_export(struct obd_export *exp);
+
+/* l_lock.c */
+void l_check_ns_lock(struct ldlm_namespace *ns);
+void l_check_no_ns_lock(struct ldlm_namespace *ns);
+
+extern struct proc_dir_entry *ldlm_svc_proc_dir;
+extern struct proc_dir_entry *ldlm_type_proc_dir;
+
+struct ldlm_state {
+ struct ptlrpc_service *ldlm_cb_service;
+ struct ptlrpc_service *ldlm_cancel_service;
+ struct ptlrpc_client *ldlm_client;
+ struct ptlrpc_connection *ldlm_server_conn;
+ struct ldlm_bl_pool *ldlm_bl_pool;
+};
+
+/* interval tree, for LDLM_EXTENT. */
+extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */
+extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l);
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l);
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock);
+void ldlm_interval_free(struct ldlm_interval *node);
+/* this function must be called with res lock held */
+static inline struct ldlm_extent *
+ldlm_interval_extent(struct ldlm_interval *node)
+{
+ struct ldlm_lock *lock;
+
+ LASSERT(!list_empty(&node->li_group));
+
+ lock = list_entry(node->li_group.next, struct ldlm_lock,
+ l_sl_policy);
+ return &lock->l_policy_data.l_extent;
+}
+
+int ldlm_init(void);
+void ldlm_exit(void);
+
+enum ldlm_policy_res {
+ LDLM_POLICY_CANCEL_LOCK,
+ LDLM_POLICY_KEEP_LOCK,
+ LDLM_POLICY_SKIP_LOCK
+};
+
+typedef enum ldlm_policy_res ldlm_policy_res_t;
+
+#define LDLM_POOL_PROC_READER_SEQ_SHOW(var, type) \
+ static int lprocfs_##var##_seq_show(struct seq_file *m, void *v) \
+ { \
+ struct ldlm_pool *pl = m->private; \
+ type tmp; \
+ \
+ spin_lock(&pl->pl_lock); \
+ tmp = pl->pl_##var; \
+ spin_unlock(&pl->pl_lock); \
+ \
+ return lprocfs_rd_uint(m, &tmp); \
+ } \
+ struct __##var##__dummy_read {; } /* semicolon catcher */
+
+#define LDLM_POOL_PROC_WRITER(var, type) \
+ static int lprocfs_wr_##var(struct file *file, \
+ const char __user *buffer, \
+ unsigned long count, void *data) \
+ { \
+ struct ldlm_pool *pl = data; \
+ type tmp; \
+ int rc; \
+ \
+ rc = lprocfs_wr_uint(file, buffer, count, &tmp); \
+ if (rc < 0) { \
+ CERROR("Can't parse user input, rc = %d\n", rc); \
+ return rc; \
+ } \
+ \
+ spin_lock(&pl->pl_lock); \
+ pl->pl_##var = tmp; \
+ spin_unlock(&pl->pl_lock); \
+ \
+ return rc; \
+ } \
+ struct __##var##__dummy_write {; } /* semicolon catcher */
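+
+/*
+ * A sketch of how the two helper macros above are meant to be instantiated,
+ * assuming an integer pool field such as pl_grant_plan (the field name here
+ * is only an example):
+ *
+ *	LDLM_POOL_PROC_READER_SEQ_SHOW(grant_plan, int);
+ *	LDLM_POOL_PROC_WRITER(grant_plan, int);
+ *
+ * This expands to lprocfs_grant_plan_seq_show() and lprocfs_wr_grant_plan(),
+ * each of which reads or updates pl->pl_grant_plan under pl->pl_lock.
+ */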
+
+static inline int is_granted_or_cancelled(struct ldlm_lock *lock)
+{
+ int ret = 0;
+
+ lock_res_and_lock(lock);
+ if (((lock->l_req_mode == lock->l_granted_mode) &&
+ !(lock->l_flags & LDLM_FL_CP_REQD)) ||
+ (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL)))
+ ret = 1;
+ unlock_res_and_lock(lock);
+
+ return ret;
+}
+
+typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *,
+ ldlm_policy_data_t *);
+
+typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *,
+ ldlm_wire_policy_data_t *);
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy);
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy);
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy);
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy);
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
new file mode 100644
index 000000000..c5c86e73c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -0,0 +1,870 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file deals with various client/target related logic including recovery.
+ *
+ * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
+ * should be moved.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_sec.h"
+#include "ldlm_internal.h"
+
+/* @priority: If non-zero, move the selected connection to the list head.
+ * @create: If zero, only search in existing connections.
+ */
+static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority, int create)
+{
+ struct ptlrpc_connection *ptlrpc_conn;
+ struct obd_import_conn *imp_conn = NULL, *item;
+ int rc = 0;
+
+ if (!create && !priority) {
+ CDEBUG(D_HA, "Nothing to do\n");
+ return -EINVAL;
+ }
+
+ ptlrpc_conn = ptlrpc_uuid_to_connection(uuid);
+ if (!ptlrpc_conn) {
+ CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid);
+ return -ENOENT;
+ }
+
+ if (create) {
+ OBD_ALLOC(imp_conn, sizeof(*imp_conn));
+ if (!imp_conn) {
+ rc = -ENOMEM;
+ goto out_put;
+ }
+ }
+
+ spin_lock(&imp->imp_lock);
+ list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
+ if (obd_uuid_equals(uuid, &item->oic_uuid)) {
+ if (priority) {
+ list_del(&item->oic_item);
+ list_add(&item->oic_item,
+ &imp->imp_conn_list);
+ item->oic_last_attempt = 0;
+ }
+ CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n",
+ imp, imp->imp_obd->obd_name, uuid->uuid,
+ (priority ? ", moved to head" : ""));
+ spin_unlock(&imp->imp_lock);
+ rc = 0;
+ goto out_free;
+ }
+ }
+ /* No existing import connection found for \a uuid. */
+ if (create) {
+ imp_conn->oic_conn = ptlrpc_conn;
+ imp_conn->oic_uuid = *uuid;
+ imp_conn->oic_last_attempt = 0;
+ if (priority)
+ list_add(&imp_conn->oic_item, &imp->imp_conn_list);
+ else
+ list_add_tail(&imp_conn->oic_item,
+ &imp->imp_conn_list);
+ CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n",
+ imp, imp->imp_obd->obd_name, uuid->uuid,
+ (priority ? "head" : "tail"));
+ } else {
+ spin_unlock(&imp->imp_lock);
+ rc = -ENOENT;
+ goto out_free;
+ }
+
+ spin_unlock(&imp->imp_lock);
+ return 0;
+out_free:
+ if (imp_conn)
+ OBD_FREE(imp_conn, sizeof(*imp_conn));
+out_put:
+ ptlrpc_connection_put(ptlrpc_conn);
+ return rc;
+}
+
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid)
+{
+ return import_set_conn(imp, uuid, 1, 0);
+}
+
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+ int priority)
+{
+ return import_set_conn(imp, uuid, priority, 1);
+}
+EXPORT_SYMBOL(client_import_add_conn);
+
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+ struct obd_import_conn *imp_conn;
+ struct obd_export *dlmexp;
+ int rc = -ENOENT;
+
+ spin_lock(&imp->imp_lock);
+ if (list_empty(&imp->imp_conn_list)) {
+ LASSERT(!imp->imp_connection);
+ goto out;
+ }
+
+ list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) {
+ if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid))
+ continue;
+ LASSERT(imp_conn->oic_conn);
+
+ if (imp_conn == imp->imp_conn_current) {
+ LASSERT(imp_conn->oic_conn == imp->imp_connection);
+
+ if (imp->imp_state != LUSTRE_IMP_CLOSED &&
+ imp->imp_state != LUSTRE_IMP_DISCON) {
+ CERROR("can't remove current connection\n");
+ rc = -EBUSY;
+ goto out;
+ }
+
+ ptlrpc_connection_put(imp->imp_connection);
+ imp->imp_connection = NULL;
+
+ dlmexp = class_conn2export(&imp->imp_dlm_handle);
+ if (dlmexp && dlmexp->exp_connection) {
+ LASSERT(dlmexp->exp_connection ==
+ imp_conn->oic_conn);
+ ptlrpc_connection_put(dlmexp->exp_connection);
+ dlmexp->exp_connection = NULL;
+ }
+ }
+
+ list_del(&imp_conn->oic_item);
+ ptlrpc_connection_put(imp_conn->oic_conn);
+ OBD_FREE(imp_conn, sizeof(*imp_conn));
+ CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
+ imp, imp->imp_obd->obd_name, uuid->uuid);
+ rc = 0;
+ break;
+ }
+out:
+ spin_unlock(&imp->imp_lock);
+ if (rc == -ENOENT)
+ CERROR("connection %s not found\n", uuid->uuid);
+ return rc;
+}
+EXPORT_SYMBOL(client_import_del_conn);
+
+/**
+ * Find a connection UUID by peer NID. \a peer is a server NID. This function
+ * searches the connection list of \a imp for a connection whose UUID can
+ * reach \a peer.
+ */
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+ struct obd_uuid *uuid)
+{
+ struct obd_import_conn *conn;
+ int rc = -ENOENT;
+
+ spin_lock(&imp->imp_lock);
+ list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+ /* Check if conn UUID does have this peer NID. */
+ if (class_check_uuid(&conn->oic_uuid, peer)) {
+ *uuid = conn->oic_uuid;
+ rc = 0;
+ break;
+ }
+ }
+ spin_unlock(&imp->imp_lock);
+ return rc;
+}
+EXPORT_SYMBOL(client_import_find_conn);
+
+void client_destroy_import(struct obd_import *imp)
+{
+ /* Drop security policy instance after all RPCs have finished/aborted
+ * to let all busy contexts be released. */
+ class_import_get(imp);
+ class_destroy_import(imp);
+ sptlrpc_import_sec_put(imp);
+ class_import_put(imp);
+}
+EXPORT_SYMBOL(client_destroy_import);
+
+/**
+ * Check whether the OSC is running on an MDT.
+ * In the config log:
+ * OSC on MDT:
+ * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
+ * OSC on client:
+ * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
+ */
+static int osc_on_mdt(char *obdname)
+{
+ char *ptr;
+
+ ptr = strrchr(obdname, '-');
+ if (ptr == NULL)
+ return 0;
+
+ if (strncmp(ptr + 1, "MDT", 3) == 0)
+ return 1;
+
+ return 0;
+}
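+
+/*
+ * For illustration only, with made-up device names that follow the
+ * config-log patterns described above:
+ *
+ *	osc_on_mdt("lustre-OST0000-osc-MDT0000");	// -> 1, OSC on an MDT
+ *	osc_on_mdt("lustre-OST0000-osc");		// -> 0, OSC on a client
+ */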
+
+/* Configure an RPC client OBD device.
+ *
+ * lcfg parameters:
+ * 1 - client UUID
+ * 2 - server UUID
+ * 3 - inactive-on-startup
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+ struct client_obd *cli = &obddev->u.cli;
+ struct obd_import *imp;
+ struct obd_uuid server_uuid;
+ int rq_portal, rp_portal, connect_op;
+ char *name = obddev->obd_type->typ_name;
+ ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+ int rc;
+
+ /* In a more perfect world, we would hang a ptlrpc_client off of
+ * obd_type and just use the values from there. */
+ if (!strcmp(name, LUSTRE_OSC_NAME)) {
+ rq_portal = OST_REQUEST_PORTAL;
+ rp_portal = OSC_REPLY_PORTAL;
+ connect_op = OST_CONNECT;
+ cli->cl_sp_me = LUSTRE_SP_CLI;
+ cli->cl_sp_to = LUSTRE_SP_OST;
+ ns_type = LDLM_NS_TYPE_OSC;
+ } else if (!strcmp(name, LUSTRE_MDC_NAME) ||
+ !strcmp(name, LUSTRE_LWP_NAME)) {
+ rq_portal = MDS_REQUEST_PORTAL;
+ rp_portal = MDC_REPLY_PORTAL;
+ connect_op = MDS_CONNECT;
+ cli->cl_sp_me = LUSTRE_SP_CLI;
+ cli->cl_sp_to = LUSTRE_SP_MDT;
+ ns_type = LDLM_NS_TYPE_MDC;
+ } else if (!strcmp(name, LUSTRE_OSP_NAME)) {
+ if (strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL) {
+ /* OSP_on_MDT for other MDTs */
+ connect_op = MDS_CONNECT;
+ cli->cl_sp_to = LUSTRE_SP_MDT;
+ ns_type = LDLM_NS_TYPE_MDC;
+ rq_portal = OUT_PORTAL;
+ } else {
+ /* OSP on MDT for OST */
+ connect_op = OST_CONNECT;
+ cli->cl_sp_to = LUSTRE_SP_OST;
+ ns_type = LDLM_NS_TYPE_OSC;
+ rq_portal = OST_REQUEST_PORTAL;
+ }
+ rp_portal = OSC_REPLY_PORTAL;
+ cli->cl_sp_me = LUSTRE_SP_CLI;
+ } else if (!strcmp(name, LUSTRE_MGC_NAME)) {
+ rq_portal = MGS_REQUEST_PORTAL;
+ rp_portal = MGC_REPLY_PORTAL;
+ connect_op = MGS_CONNECT;
+ cli->cl_sp_me = LUSTRE_SP_MGC;
+ cli->cl_sp_to = LUSTRE_SP_MGS;
+ cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID;
+ ns_type = LDLM_NS_TYPE_MGC;
+ } else {
+ CERROR("unknown client OBD type \"%s\", can't setup\n",
+ name);
+ return -EINVAL;
+ }
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+ CERROR("requires a TARGET UUID\n");
+ return -EINVAL;
+ }
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
+ CERROR("client UUID must be less than 38 characters\n");
+ return -EINVAL;
+ }
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
+ CERROR("setup requires a SERVER UUID\n");
+ return -EINVAL;
+ }
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
+ CERROR("target UUID must be less than 38 characters\n");
+ return -EINVAL;
+ }
+
+ init_rwsem(&cli->cl_sem);
+ mutex_init(&cli->cl_mgc_mutex);
+ cli->cl_conn_count = 0;
+ memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
+ min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
+ sizeof(server_uuid)));
+
+ cli->cl_dirty = 0;
+ cli->cl_avail_grant = 0;
+ /* FIXME: Should limit this for the sum of all cl_dirty_max. */
+ cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
+ if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > totalram_pages / 8)
+ cli->cl_dirty_max = totalram_pages << (PAGE_CACHE_SHIFT - 3);
+ INIT_LIST_HEAD(&cli->cl_cache_waiters);
+ INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+ INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
+ INIT_LIST_HEAD(&cli->cl_loi_write_list);
+ INIT_LIST_HEAD(&cli->cl_loi_read_list);
+ client_obd_list_lock_init(&cli->cl_loi_list_lock);
+ atomic_set(&cli->cl_pending_w_pages, 0);
+ atomic_set(&cli->cl_pending_r_pages, 0);
+ cli->cl_r_in_flight = 0;
+ cli->cl_w_in_flight = 0;
+
+ spin_lock_init(&cli->cl_read_rpc_hist.oh_lock);
+ spin_lock_init(&cli->cl_write_rpc_hist.oh_lock);
+ spin_lock_init(&cli->cl_read_page_hist.oh_lock);
+ spin_lock_init(&cli->cl_write_page_hist.oh_lock);
+ spin_lock_init(&cli->cl_read_offset_hist.oh_lock);
+ spin_lock_init(&cli->cl_write_offset_hist.oh_lock);
+
+ /* lru for osc. */
+ INIT_LIST_HEAD(&cli->cl_lru_osc);
+ atomic_set(&cli->cl_lru_shrinkers, 0);
+ atomic_set(&cli->cl_lru_busy, 0);
+ atomic_set(&cli->cl_lru_in_list, 0);
+ INIT_LIST_HEAD(&cli->cl_lru_list);
+ client_obd_list_lock_init(&cli->cl_lru_list_lock);
+
+ init_waitqueue_head(&cli->cl_destroy_waitq);
+ atomic_set(&cli->cl_destroy_in_flight, 0);
+ /* Turn on checksumming by default. */
+ cli->cl_checksum = 1;
+ /*
+ * The supported checksum types will be worked out at connect time.
+ * Set cl_cksum* to CRC32 for now to avoid returning bogus info
+ * through procfs.
+ */
+ cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
+ atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
+
+ /* This value may be reduced at connect time in
+ * ptlrpc_connect_interpret(). We initialize it to only
+ * 1MB until we know what the performance looks like.
+ * In the future this should likely be increased. LU-1431 */
+ cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
+ LNET_MTU >> PAGE_CACHE_SHIFT);
+
+ if (!strcmp(name, LUSTRE_MDC_NAME)) {
+ cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT;
+ } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) {
+ cli->cl_max_rpcs_in_flight = 2;
+ } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) {
+ cli->cl_max_rpcs_in_flight = 3;
+ } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) {
+ cli->cl_max_rpcs_in_flight = 4;
+ } else {
+ if (osc_on_mdt(obddev->obd_name))
+ cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT;
+ else
+ cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
+ }
+ rc = ldlm_get_ref();
+ if (rc) {
+ CERROR("ldlm_get_ref failed: %d\n", rc);
+ goto err;
+ }
+
+ ptlrpc_init_client(rq_portal, rp_portal, name,
+ &obddev->obd_ldlm_client);
+
+ imp = class_new_import(obddev);
+ if (imp == NULL) {
+ rc = -ENOENT;
+ goto err_ldlm;
+ }
+ imp->imp_client = &obddev->obd_ldlm_client;
+ imp->imp_connect_op = connect_op;
+ memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+ LUSTRE_CFG_BUFLEN(lcfg, 1));
+ class_import_put(imp);
+
+ rc = client_import_add_conn(imp, &server_uuid, 1);
+ if (rc) {
+ CERROR("can't add initial connection\n");
+ goto err_import;
+ }
+
+ cli->cl_import = imp;
+ /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
+ cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
+ cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+ if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
+ CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
+ name, obddev->obd_name,
+ cli->cl_target_uuid.uuid);
+ spin_lock(&imp->imp_lock);
+ imp->imp_deactive = 1;
+ spin_unlock(&imp->imp_lock);
+ }
+ }
+
+ obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name,
+ LDLM_NAMESPACE_CLIENT,
+ LDLM_NAMESPACE_GREEDY,
+ ns_type);
+ if (obddev->obd_namespace == NULL) {
+ CERROR("Unable to create client namespace - %s\n",
+ obddev->obd_name);
+ rc = -ENOMEM;
+ goto err_import;
+ }
+
+ cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
+
+ return rc;
+
+err_import:
+ class_destroy_import(imp);
+err_ldlm:
+ ldlm_put_ref();
+err:
+ return rc;
+
+}
+EXPORT_SYMBOL(client_obd_setup);
+
+int client_obd_cleanup(struct obd_device *obddev)
+{
+ ldlm_namespace_free_post(obddev->obd_namespace);
+ obddev->obd_namespace = NULL;
+
+ LASSERT(obddev->u.cli.cl_import == NULL);
+
+ ldlm_put_ref();
+ return 0;
+}
+EXPORT_SYMBOL(client_obd_cleanup);
+
+/* ->o_connect() method for the client side (OSC, MDC, and MGC) */
+int client_connect_import(const struct lu_env *env,
+ struct obd_export **exp,
+ struct obd_device *obd, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata)
+{
+ struct client_obd *cli = &obd->u.cli;
+ struct obd_import *imp = cli->cl_import;
+ struct obd_connect_data *ocd;
+ struct lustre_handle conn = { 0 };
+ int rc;
+
+ *exp = NULL;
+ down_write(&cli->cl_sem);
+ if (cli->cl_conn_count > 0) {
+ rc = -EALREADY;
+ goto out_sem;
+ }
+
+ rc = class_connect(&conn, obd, cluuid);
+ if (rc)
+ goto out_sem;
+
+ cli->cl_conn_count++;
+ *exp = class_conn2export(&conn);
+
+ LASSERT(obd->obd_namespace);
+
+ imp->imp_dlm_handle = conn;
+ rc = ptlrpc_init_import(imp);
+ if (rc != 0)
+ goto out_ldlm;
+
+ ocd = &imp->imp_connect_data;
+ if (data) {
+ *ocd = *data;
+ imp->imp_connect_flags_orig = data->ocd_connect_flags;
+ }
+
+ rc = ptlrpc_connect_import(imp);
+ if (rc != 0) {
+ LASSERT(imp->imp_state == LUSTRE_IMP_DISCON);
+ goto out_ldlm;
+ }
+ LASSERT(*exp != NULL && (*exp)->exp_connection);
+
+ if (data) {
+ LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
+ ocd->ocd_connect_flags, "old %#llx, new %#llx\n",
+ data->ocd_connect_flags, ocd->ocd_connect_flags);
+ data->ocd_connect_flags = ocd->ocd_connect_flags;
+ }
+
+ ptlrpc_pinger_add_import(imp);
+
+ if (rc) {
+out_ldlm:
+ cli->cl_conn_count--;
+ class_disconnect(*exp);
+ *exp = NULL;
+ }
+out_sem:
+ up_write(&cli->cl_sem);
+
+ return rc;
+}
+EXPORT_SYMBOL(client_connect_import);
+
+int client_disconnect_export(struct obd_export *exp)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct client_obd *cli;
+ struct obd_import *imp;
+ int rc = 0, err;
+
+ if (!obd) {
+ CERROR("invalid export for disconnect: exp %p cookie %#llx\n",
+ exp, exp ? exp->exp_handle.h_cookie : -1);
+ return -EINVAL;
+ }
+
+ cli = &obd->u.cli;
+ imp = cli->cl_import;
+
+ down_write(&cli->cl_sem);
+ CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
+ cli->cl_conn_count);
+
+ if (!cli->cl_conn_count) {
+ CERROR("disconnecting disconnected device (%s)\n",
+ obd->obd_name);
+ rc = -EINVAL;
+ goto out_disconnect;
+ }
+
+ cli->cl_conn_count--;
+ if (cli->cl_conn_count) {
+ rc = 0;
+ goto out_disconnect;
+ }
+
+ /* Mark the import deactivated now, so we don't try to reconnect if
+ * any of the cleanup RPCs fails (e.g. LDLM cancel). We don't fully
+ * deactivate the import here, since that would drop all requests. */
+ spin_lock(&imp->imp_lock);
+ imp->imp_deactive = 1;
+ spin_unlock(&imp->imp_lock);
+
+ /* Some non-replayable imports (MDS's OSCs) are pinged, so just
+ * delete it regardless. (It's safe to delete an import that was
+ * never added.) */
+ (void)ptlrpc_pinger_del_import(imp);
+
+ if (obd->obd_namespace != NULL) {
+ /* obd_force == local only */
+ ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
+ obd->obd_force ? LCF_LOCAL : 0, NULL);
+ ldlm_namespace_free_prior(obd->obd_namespace, imp,
+ obd->obd_force);
+ }
+
+ /* There's no need to hold the semaphore while disconnecting an
+ * import, and holding it may actually cause a deadlock in GSS. */
+ up_write(&cli->cl_sem);
+ rc = ptlrpc_disconnect_import(imp, 0);
+ down_write(&cli->cl_sem);
+
+ ptlrpc_invalidate_import(imp);
+
+out_disconnect:
+ /* Use server style - class_disconnect should always be called for
+ * o_disconnect. */
+ err = class_disconnect(exp);
+ if (!rc && err)
+ rc = err;
+
+ up_write(&cli->cl_sem);
+
+ return rc;
+}
+EXPORT_SYMBOL(client_disconnect_export);
+
+
+/**
+ * Packs current SLV and Limit into \a req.
+ */
+int target_pack_pool_reply(struct ptlrpc_request *req)
+{
+ struct obd_device *obd;
+
+ /* Check that we still have all structures alive, as this may
+ * be a late RPC arriving at shutdown time. */
+ if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
+ !exp_connect_lru_resize(req->rq_export))) {
+ lustre_msg_set_slv(req->rq_repmsg, 0);
+ lustre_msg_set_limit(req->rq_repmsg, 0);
+ return 0;
+ }
+
+ /* OBD is alive here as export is alive, which we checked above. */
+ obd = req->rq_export->exp_obd;
+
+ read_lock(&obd->obd_pool_lock);
+ lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv);
+ lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit);
+ read_unlock(&obd->obd_pool_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(target_pack_pool_reply);
+
+int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+{
+ if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
+ DEBUG_REQ(D_ERROR, req, "dropping reply");
+ return -ECOMM;
+ }
+
+ if (unlikely(rc)) {
+ DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+ req->rq_status = rc;
+ return ptlrpc_send_error(req, 1);
+ } else {
+ DEBUG_REQ(D_NET, req, "sending reply");
+ }
+
+ return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT);
+}
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+ struct ptlrpc_service_part *svcpt;
+ int netrc;
+ struct ptlrpc_reply_state *rs;
+ struct obd_export *exp;
+
+ if (req->rq_no_reply)
+ return;
+
+ svcpt = req->rq_rqbd->rqbd_svcpt;
+ rs = req->rq_reply_state;
+ if (rs == NULL || !rs->rs_difficult) {
+ /* no notifiers */
+ target_send_reply_msg(req, rc, fail_id);
+ return;
+ }
+
+ /* must be an export if locks saved */
+ LASSERT(req->rq_export != NULL);
+ /* req/reply consistent */
+ LASSERT(rs->rs_svcpt == svcpt);
+
+ /* "fresh" reply */
+ LASSERT(!rs->rs_scheduled);
+ LASSERT(!rs->rs_scheduled_ever);
+ LASSERT(!rs->rs_handled);
+ LASSERT(!rs->rs_on_net);
+ LASSERT(rs->rs_export == NULL);
+ LASSERT(list_empty(&rs->rs_obd_list));
+ LASSERT(list_empty(&rs->rs_exp_list));
+
+ exp = class_export_get(req->rq_export);
+
+ /* disable reply scheduling while I'm setting up */
+ rs->rs_scheduled = 1;
+ rs->rs_on_net = 1;
+ rs->rs_xid = req->rq_xid;
+ rs->rs_transno = req->rq_transno;
+ rs->rs_export = exp;
+ rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+ spin_lock(&exp->exp_uncommitted_replies_lock);
+ CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n",
+ rs->rs_transno, exp->exp_last_committed);
+ if (rs->rs_transno > exp->exp_last_committed) {
+ /* not committed already */
+ list_add_tail(&rs->rs_obd_list,
+ &exp->exp_uncommitted_replies);
+ }
+ spin_unlock(&exp->exp_uncommitted_replies_lock);
+
+ spin_lock(&exp->exp_lock);
+ list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
+ spin_unlock(&exp->exp_lock);
+
+ netrc = target_send_reply_msg(req, rc, fail_id);
+
+ spin_lock(&svcpt->scp_rep_lock);
+
+ atomic_inc(&svcpt->scp_nreps_difficult);
+
+ if (netrc != 0) {
+ /* error sending: reply is off the net. Also we need +1
+ * reply ref until ptlrpc_handle_rs() is done
+ * with the reply state (if the send was successful, there
+ * would have been +1 ref for the net, which
+ * reply_out_callback leaves alone) */
+ rs->rs_on_net = 0;
+ ptlrpc_rs_addref(rs);
+ }
+
+ spin_lock(&rs->rs_lock);
+ if (rs->rs_transno <= exp->exp_last_committed ||
+ (!rs->rs_on_net && !rs->rs_no_ack) ||
+ list_empty(&rs->rs_exp_list) || /* completed already */
+ list_empty(&rs->rs_obd_list)) {
+ CDEBUG(D_HA, "Schedule reply immediately\n");
+ ptlrpc_dispatch_difficult_reply(rs);
+ } else {
+ list_add(&rs->rs_list, &svcpt->scp_rep_active);
+ rs->rs_scheduled = 0; /* allow notifier to schedule */
+ }
+ spin_unlock(&rs->rs_lock);
+ spin_unlock(&svcpt->scp_rep_lock);
+}
+EXPORT_SYMBOL(target_send_reply);
+
+ldlm_mode_t lck_compat_array[] = {
+ [LCK_EX] = LCK_COMPAT_EX,
+ [LCK_PW] = LCK_COMPAT_PW,
+ [LCK_PR] = LCK_COMPAT_PR,
+ [LCK_CW] = LCK_COMPAT_CW,
+ [LCK_CR] = LCK_COMPAT_CR,
+ [LCK_NL] = LCK_COMPAT_NL,
+ [LCK_GROUP] = LCK_COMPAT_GROUP,
+ [LCK_COS] = LCK_COMPAT_COS,
+};
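+
+/*
+ * A minimal sketch of how this table encodes the compatibility matrix: the
+ * LCK_* modes are single-bit values, so a granted/requested pair is
+ * compatible exactly when the requested mode's bit is set in the granted
+ * mode's entry.
+ *
+ *	if (lck_compat_array[LCK_PR] & LCK_PR)
+ *		// PR is compatible with PR: multiple readers may be granted
+ *	if (!(lck_compat_array[LCK_PR] & LCK_PW))
+ *		// PR conflicts with PW: the writer has to wait
+ */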
+
+/**
+ * Rather arbitrary mapping from LDLM error codes to errno values. This should
+ * not escape to the user level.
+ */
+int ldlm_error2errno(ldlm_error_t error)
+{
+ int result;
+
+ switch (error) {
+ case ELDLM_OK:
+ result = 0;
+ break;
+ case ELDLM_LOCK_CHANGED:
+ result = -ESTALE;
+ break;
+ case ELDLM_LOCK_ABORTED:
+ result = -ENAVAIL;
+ break;
+ case ELDLM_LOCK_REPLACED:
+ result = -ESRCH;
+ break;
+ case ELDLM_NO_LOCK_DATA:
+ result = -ENOENT;
+ break;
+ case ELDLM_NAMESPACE_EXISTS:
+ result = -EEXIST;
+ break;
+ case ELDLM_BAD_NAMESPACE:
+ result = -EBADF;
+ break;
+ default:
+ if (((int)error) < 0) /* cast to signed type */
+ result = error; /* as ldlm_error_t can be unsigned */
+ else {
+ CERROR("Invalid DLM result code: %d\n", error);
+ result = -EPROTO;
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(ldlm_error2errno);
+
+/**
+ * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t.
+ */
+ldlm_error_t ldlm_errno2error(int err_no)
+{
+ int error;
+
+ switch (err_no) {
+ case 0:
+ error = ELDLM_OK;
+ break;
+ case -ESTALE:
+ error = ELDLM_LOCK_CHANGED;
+ break;
+ case -ENAVAIL:
+ error = ELDLM_LOCK_ABORTED;
+ break;
+ case -ESRCH:
+ error = ELDLM_LOCK_REPLACED;
+ break;
+ case -ENOENT:
+ error = ELDLM_NO_LOCK_DATA;
+ break;
+ case -EEXIST:
+ error = ELDLM_NAMESPACE_EXISTS;
+ break;
+ case -EBADF:
+ error = ELDLM_BAD_NAMESPACE;
+ break;
+ default:
+ error = err_no;
+ }
+ return error;
+}
+EXPORT_SYMBOL(ldlm_errno2error);
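+
+/*
+ * A brief illustration of the two mappings above; the values are taken
+ * directly from the switch statements, nothing new is assumed:
+ *
+ *	ldlm_error2errno(ELDLM_LOCK_CHANGED);	// -> -ESTALE
+ *	ldlm_errno2error(-ESTALE);		// -> ELDLM_LOCK_CHANGED
+ *	ldlm_error2errno(ELDLM_NO_LOCK_DATA);	// -> -ENOENT
+ */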
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp)
+{
+ spin_lock(&exp->exp_locks_list_guard);
+ if (!list_empty(&exp->exp_locks_list)) {
+ struct ldlm_lock *lock;
+
+ CERROR("dumping locks for export %p,ignore if the unmount doesn't hang\n",
+ exp);
+ list_for_each_entry(lock, &exp->exp_locks_list,
+ l_exp_refs_link)
+ LDLM_ERROR(lock, "lock:");
+ }
+ spin_unlock(&exp->exp_locks_list_guard);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
new file mode 100644
index 000000000..84b111eb4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
@@ -0,0 +1,2322 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lock.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/lustre_intent.h"
+#include "../include/obd_class.h"
+#include "ldlm_internal.h"
+
+/* lock types */
+char *ldlm_lockname[] = {
+ [0] = "--",
+ [LCK_EX] = "EX",
+ [LCK_PW] = "PW",
+ [LCK_PR] = "PR",
+ [LCK_CW] = "CW",
+ [LCK_CR] = "CR",
+ [LCK_NL] = "NL",
+ [LCK_GROUP] = "GROUP",
+ [LCK_COS] = "COS",
+};
+EXPORT_SYMBOL(ldlm_lockname);
+
+char *ldlm_typename[] = {
+ [LDLM_PLAIN] = "PLN",
+ [LDLM_EXTENT] = "EXT",
+ [LDLM_FLOCK] = "FLK",
+ [LDLM_IBITS] = "IBT",
+};
+EXPORT_SYMBOL(ldlm_typename);
+
+static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = {
+ [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local,
+ [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+ [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire18_to_local,
+ [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local,
+};
+
+static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = {
+ [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local,
+ [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+ [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire21_to_local,
+ [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local,
+};
+
+static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = {
+ [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire,
+ [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire,
+ [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire,
+ [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire,
+};
+
+/**
+ * Converts lock policy from local format to on the wire lock_desc format
+ */
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+ const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy)
+{
+ ldlm_policy_local_to_wire_t convert;
+
+ convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE];
+
+ convert(lpolicy, wpolicy);
+}
+
+/**
+ * Converts lock policy from on the wire lock_desc format to local format
+ */
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+ const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy)
+{
+ ldlm_policy_wire_to_local_t convert;
+ int new_client;
+
+ /* Compatibility handling for 2.0.0 clients, but 2.0.0 isn't supported */
+ new_client = (exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0;
+ if (new_client)
+ convert = ldlm_policy_wire21_to_local[type - LDLM_MIN_TYPE];
+ else
+ convert = ldlm_policy_wire18_to_local[type - LDLM_MIN_TYPE];
+
+ convert(wpolicy, lpolicy);
+}
+
+char *ldlm_it2str(int it)
+{
+ switch (it) {
+ case IT_OPEN:
+ return "open";
+ case IT_CREAT:
+ return "creat";
+ case (IT_OPEN | IT_CREAT):
+ return "open|creat";
+ case IT_READDIR:
+ return "readdir";
+ case IT_GETATTR:
+ return "getattr";
+ case IT_LOOKUP:
+ return "lookup";
+ case IT_UNLINK:
+ return "unlink";
+ case IT_GETXATTR:
+ return "getxattr";
+ case IT_LAYOUT:
+ return "layout";
+ default:
+ CERROR("Unknown intent %d\n", it);
+ return "UNKNOWN";
+ }
+}
+EXPORT_SYMBOL(ldlm_it2str);
+
+
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg)
+{
+ ns->ns_policy = arg;
+}
+EXPORT_SYMBOL(ldlm_register_intent);
+
+/*
+ * REFCOUNTED LOCK OBJECTS
+ */
+
+
+/**
+ * Get a reference on a lock.
+ *
+ * Lock refcounts, during creation:
+ * - one special one for allocation, dec'd only once in destroy
+ * - one for being a lock that's in-use
+ * - one for the addref associated with a new lock
+ */
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock)
+{
+ atomic_inc(&lock->l_refc);
+ return lock;
+}
+EXPORT_SYMBOL(ldlm_lock_get);
+
+/**
+ * Release lock reference.
+ *
+ * Also frees the lock if it was the last reference.
+ */
+void ldlm_lock_put(struct ldlm_lock *lock)
+{
+ LASSERT(lock->l_resource != LP_POISON);
+ LASSERT(atomic_read(&lock->l_refc) > 0);
+ if (atomic_dec_and_test(&lock->l_refc)) {
+ struct ldlm_resource *res;
+
+ LDLM_DEBUG(lock,
+ "final lock_put on destroyed lock, freeing it.");
+
+ res = lock->l_resource;
+ LASSERT(lock->l_flags & LDLM_FL_DESTROYED);
+ LASSERT(list_empty(&lock->l_res_link));
+ LASSERT(list_empty(&lock->l_pending_chain));
+
+ lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats,
+ LDLM_NSS_LOCKS);
+ lu_ref_del(&res->lr_reference, "lock", lock);
+ ldlm_resource_putref(res);
+ lock->l_resource = NULL;
+ if (lock->l_export) {
+ class_export_lock_put(lock->l_export, lock);
+ lock->l_export = NULL;
+ }
+
+ if (lock->l_lvb_data != NULL)
+ OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
+
+ ldlm_interval_free(ldlm_interval_detach(lock));
+ lu_ref_fini(&lock->l_reference);
+ OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle);
+ }
+}
+EXPORT_SYMBOL(ldlm_lock_put);
+
+/**
+ * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked.
+ */
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
+{
+ int rc = 0;
+
+ if (!list_empty(&lock->l_lru)) {
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+ list_del_init(&lock->l_lru);
+ LASSERT(ns->ns_nr_unused > 0);
+ ns->ns_nr_unused--;
+ rc = 1;
+ }
+ return rc;
+}
+
+/**
+ * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first.
+ */
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+ int rc;
+
+ if (lock->l_flags & LDLM_FL_NS_SRV) {
+ LASSERT(list_empty(&lock->l_lru));
+ return 0;
+ }
+
+ spin_lock(&ns->ns_lock);
+ rc = ldlm_lock_remove_from_lru_nolock(lock);
+ spin_unlock(&ns->ns_lock);
+ return rc;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked.
+ */
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ lock->l_last_used = cfs_time_current();
+ LASSERT(list_empty(&lock->l_lru));
+ LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+ list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+ if (lock->l_flags & LDLM_FL_SKIPPED)
+ lock->l_flags &= ~LDLM_FL_SKIPPED;
+ LASSERT(ns->ns_nr_unused >= 0);
+ ns->ns_nr_unused++;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks
+ * first.
+ */
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ spin_lock(&ns->ns_lock);
+ ldlm_lock_add_to_lru_nolock(lock);
+ spin_unlock(&ns->ns_lock);
+}
+
+/**
+ * Moves LDLM lock \a lock that is already in namespace LRU to the tail of
+ * the LRU. Performs necessary LRU locking
+ */
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock)
+{
+ struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+ if (lock->l_flags & LDLM_FL_NS_SRV) {
+ LASSERT(list_empty(&lock->l_lru));
+ return;
+ }
+
+ spin_lock(&ns->ns_lock);
+ if (!list_empty(&lock->l_lru)) {
+ ldlm_lock_remove_from_lru_nolock(lock);
+ ldlm_lock_add_to_lru_nolock(lock);
+ }
+ spin_unlock(&ns->ns_lock);
+}
+
+/**
+ * Helper to destroy a locked lock.
+ *
+ * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock
+ * Must be called with l_lock and lr_lock held.
+ *
+ * Does not actually free the lock data, but rather marks the lock as
+ * destroyed by setting the LDLM_FL_DESTROYED flag. Destroys the
+ * handle->lock association too, so that the lock can no longer be found,
+ * and removes the lock from the LRU list. Actual lock freeing occurs when
+ * the last lock reference goes away.
+ *
+ * Original comment (of some historical value):
+ * This used to have a 'strict' flag, which recovery would use to mark an
+ * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I
+ * shall explain why it's gone: with the new hash table scheme, once you call
+ * ldlm_lock_destroy, you can never drop your final references on this lock.
+ * Because it's not in the hash table anymore. -phil
+ */
+int ldlm_lock_destroy_internal(struct ldlm_lock *lock)
+{
+ if (lock->l_readers || lock->l_writers) {
+ LDLM_ERROR(lock, "lock still has references");
+ LBUG();
+ }
+
+ if (!list_empty(&lock->l_res_link)) {
+ LDLM_ERROR(lock, "lock still on resource");
+ LBUG();
+ }
+
+ if (lock->l_flags & LDLM_FL_DESTROYED) {
+ LASSERT(list_empty(&lock->l_lru));
+ return 0;
+ }
+ lock->l_flags |= LDLM_FL_DESTROYED;
+
+ if (lock->l_export && lock->l_export->exp_lock_hash) {
+ /* NB: it's safe to call cfs_hash_del() even if the lock isn't
+ * in exp_lock_hash. */
+ /* In the function below, .hs_keycmp resolves to
+ * ldlm_export_lock_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ cfs_hash_del(lock->l_export->exp_lock_hash,
+ &lock->l_remote_handle, &lock->l_exp_hash);
+ }
+
+ ldlm_lock_remove_from_lru(lock);
+ class_handle_unhash(&lock->l_handle);
+
+#if 0
+ /* Wake anyone waiting for this lock */
+ /* FIXME: I should probably add yet another flag, instead of using
+ * l_export to only call this on clients */
+ if (lock->l_export)
+ class_export_put(lock->l_export);
+ lock->l_export = NULL;
+ if (lock->l_export && lock->l_completion_ast)
+ lock->l_completion_ast(lock, 0);
+#endif
+ return 1;
+}
+
+/**
+ * Destroys a LDLM lock \a lock. Performs necessary locking first.
+ */
+void ldlm_lock_destroy(struct ldlm_lock *lock)
+{
+ int first;
+
+ lock_res_and_lock(lock);
+ first = ldlm_lock_destroy_internal(lock);
+ unlock_res_and_lock(lock);
+
+ /* drop reference from hashtable only for first destroy */
+ if (first) {
+ lu_ref_del(&lock->l_reference, "hash", lock);
+ LDLM_LOCK_RELEASE(lock);
+ }
+}
+
+/**
+ * Destroys a LDLM lock \a lock that is already locked.
+ */
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock)
+{
+ int first;
+
+ first = ldlm_lock_destroy_internal(lock);
+ /* drop reference from hashtable only for first destroy */
+ if (first) {
+ lu_ref_del(&lock->l_reference, "hash", lock);
+ LDLM_LOCK_RELEASE(lock);
+ }
+}
+
+/* this is called by portals_handle2object with the handle lock taken */
+static void lock_handle_addref(void *lock)
+{
+ LDLM_LOCK_GET((struct ldlm_lock *)lock);
+}
+
+static void lock_handle_free(void *lock, int size)
+{
+ LASSERT(size == sizeof(struct ldlm_lock));
+ OBD_SLAB_FREE(lock, ldlm_lock_slab, size);
+}
+
+struct portals_handle_ops lock_handle_ops = {
+ .hop_addref = lock_handle_addref,
+ .hop_free = lock_handle_free,
+};
+
+/**
+ * Allocate and initialize a new lock structure.
+ *
+ * usage: pass in a resource on which you have done ldlm_resource_get();
+ *        the new lock will take over the refcount.
+ * returns: lock with refcount 2 - one for the current caller and one for remote
+ */
+static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
+{
+ struct ldlm_lock *lock;
+
+ if (resource == NULL)
+ LBUG();
+
+ OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, GFP_NOFS);
+ if (lock == NULL)
+ return NULL;
+
+ spin_lock_init(&lock->l_lock);
+ lock->l_resource = resource;
+ lu_ref_add(&resource->lr_reference, "lock", lock);
+
+ atomic_set(&lock->l_refc, 2);
+ INIT_LIST_HEAD(&lock->l_res_link);
+ INIT_LIST_HEAD(&lock->l_lru);
+ INIT_LIST_HEAD(&lock->l_pending_chain);
+ INIT_LIST_HEAD(&lock->l_bl_ast);
+ INIT_LIST_HEAD(&lock->l_cp_ast);
+ INIT_LIST_HEAD(&lock->l_rk_ast);
+ init_waitqueue_head(&lock->l_waitq);
+ lock->l_blocking_lock = NULL;
+ INIT_LIST_HEAD(&lock->l_sl_mode);
+ INIT_LIST_HEAD(&lock->l_sl_policy);
+ INIT_HLIST_NODE(&lock->l_exp_hash);
+ INIT_HLIST_NODE(&lock->l_exp_flock_hash);
+
+ lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats,
+ LDLM_NSS_LOCKS);
+ INIT_LIST_HEAD(&lock->l_handle.h_link);
+ class_handle_hash(&lock->l_handle, &lock_handle_ops);
+
+ lu_ref_init(&lock->l_reference);
+ lu_ref_add(&lock->l_reference, "hash", lock);
+ lock->l_callback_timeout = 0;
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ INIT_LIST_HEAD(&lock->l_exp_refs_link);
+ lock->l_exp_refs_nr = 0;
+ lock->l_exp_refs_target = NULL;
+#endif
+ INIT_LIST_HEAD(&lock->l_exp_list);
+
+ return lock;
+}
+
+/**
+ * Moves LDLM lock \a lock to another resource.
+ * This is used on client when server returns some other lock than requested
+ * (typically as a result of intent operation)
+ */
+int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+ const struct ldlm_res_id *new_resid)
+{
+ struct ldlm_resource *oldres = lock->l_resource;
+ struct ldlm_resource *newres;
+ int type;
+
+ LASSERT(ns_is_client(ns));
+
+ lock_res_and_lock(lock);
+ if (memcmp(new_resid, &lock->l_resource->lr_name,
+ sizeof(lock->l_resource->lr_name)) == 0) {
+ /* Nothing to do */
+ unlock_res_and_lock(lock);
+ return 0;
+ }
+
+ LASSERT(new_resid->name[0] != 0);
+
+ /* This function assumes that the lock isn't on any lists */
+ LASSERT(list_empty(&lock->l_res_link));
+
+ type = oldres->lr_type;
+ unlock_res_and_lock(lock);
+
+ newres = ldlm_resource_get(ns, NULL, new_resid, type, 1);
+ if (newres == NULL)
+ return -ENOMEM;
+
+ lu_ref_add(&newres->lr_reference, "lock", lock);
+ /*
+ * To flip the lock from the old to the new resource, lock, oldres and
+ * newres have to be locked. Resource spin-locks are nested within
+ * lock->l_lock, and are taken in the memory address order to avoid
+ * dead-locks.
+ */
+ spin_lock(&lock->l_lock);
+ oldres = lock->l_resource;
+ if (oldres < newres) {
+ lock_res(oldres);
+ lock_res_nested(newres, LRT_NEW);
+ } else {
+ lock_res(newres);
+ lock_res_nested(oldres, LRT_NEW);
+ }
+ LASSERT(memcmp(new_resid, &oldres->lr_name,
+ sizeof(oldres->lr_name)) != 0);
+ lock->l_resource = newres;
+ unlock_res(oldres);
+ unlock_res_and_lock(lock);
+
+ /* ...and the flowers are still standing! */
+ lu_ref_del(&oldres->lr_reference, "lock", lock);
+ ldlm_resource_putref(oldres);
+
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_lock_change_resource);
+
+/** \defgroup ldlm_handles LDLM HANDLES
+ * Ways to get hold of locks without any addresses.
+ * @{
+ */
+
+/**
+ * Fills in handle for LDLM lock \a lock into supplied \a lockh
+ * Does not take any references.
+ */
+void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh)
+{
+ lockh->cookie = lock->l_handle.h_cookie;
+}
+EXPORT_SYMBOL(ldlm_lock2handle);
+
+/**
+ * Obtain a lock reference by handle.
+ *
+ * If \a flags is non-zero: atomically get the lock and set the flags.
+ * Returns NULL if any of the requested flags is already set.
+ */
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
+ __u64 flags)
+{
+ struct ldlm_lock *lock;
+
+ LASSERT(handle);
+
+ lock = class_handle2object(handle->cookie);
+ if (lock == NULL)
+ return NULL;
+
+ /* It's unlikely but possible that someone marked the lock as
+ * destroyed after we did handle2object on it */
+ if (flags == 0 && ((lock->l_flags & LDLM_FL_DESTROYED) == 0)) {
+ lu_ref_add(&lock->l_reference, "handle", current);
+ return lock;
+ }
+
+ lock_res_and_lock(lock);
+
+ LASSERT(lock->l_resource != NULL);
+
+ lu_ref_add_atomic(&lock->l_reference, "handle", current);
+ if (unlikely(lock->l_flags & LDLM_FL_DESTROYED)) {
+ unlock_res_and_lock(lock);
+ CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock);
+ LDLM_LOCK_PUT(lock);
+ return NULL;
+ }
+
+ if (flags && (lock->l_flags & flags)) {
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+ return NULL;
+ }
+
+ if (flags)
+ lock->l_flags |= flags;
+
+ unlock_res_and_lock(lock);
+ return lock;
+}
+EXPORT_SYMBOL(__ldlm_handle2lock);
+/** @} ldlm_handles */
+
+/**
+ * Fill in "on the wire" representation for given LDLM lock into supplied
+ * lock descriptor \a desc structure.
+ */
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
+{
+ ldlm_res2desc(lock->l_resource, &desc->l_resource);
+ desc->l_req_mode = lock->l_req_mode;
+ desc->l_granted_mode = lock->l_granted_mode;
+ ldlm_convert_policy_to_wire(lock->l_resource->lr_type,
+ &lock->l_policy_data,
+ &desc->l_policy_data);
+}
+EXPORT_SYMBOL(ldlm_lock2desc);
+
+/**
+ * Add a lock to list of conflicting locks to send AST to.
+ *
+ * Only add if we have not sent a blocking AST to the lock yet.
+ */
+void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+ struct list_head *work_list)
+{
+ if ((lock->l_flags & LDLM_FL_AST_SENT) == 0) {
+ LDLM_DEBUG(lock, "lock incompatible; sending blocking AST.");
+ lock->l_flags |= LDLM_FL_AST_SENT;
+ /* If the enqueuing client said so, tell the AST recipient to
+ * discard dirty data, rather than writing back. */
+ if (new->l_flags & LDLM_FL_AST_DISCARD_DATA)
+ lock->l_flags |= LDLM_FL_DISCARD_DATA;
+ LASSERT(list_empty(&lock->l_bl_ast));
+ list_add(&lock->l_bl_ast, work_list);
+ LDLM_LOCK_GET(lock);
+ LASSERT(lock->l_blocking_lock == NULL);
+ lock->l_blocking_lock = LDLM_LOCK_GET(new);
+ }
+}
+
+/**
+ * Add a lock to list of just granted locks to send completion AST to.
+ */
+void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list)
+{
+ if ((lock->l_flags & LDLM_FL_CP_REQD) == 0) {
+ lock->l_flags |= LDLM_FL_CP_REQD;
+ LDLM_DEBUG(lock, "lock granted; sending completion AST.");
+ LASSERT(list_empty(&lock->l_cp_ast));
+ list_add(&lock->l_cp_ast, work_list);
+ LDLM_LOCK_GET(lock);
+ }
+}
+
+/**
+ * Aggregator function to add AST work items into a list. Determines
+ * what sort of an AST work needs to be done and calls the proper
+ * adding function.
+ * Must be called with lr_lock held.
+ */
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+ struct list_head *work_list)
+{
+ check_res_locked(lock->l_resource);
+ if (new)
+ ldlm_add_bl_work_item(lock, new, work_list);
+ else
+ ldlm_add_cp_work_item(lock, work_list);
+}
+
+/**
+ * Add specified reader/writer reference to LDLM lock with handle \a lockh.
+ * r/w reference type is determined by \a mode
+ * Calls ldlm_lock_addref_internal.
+ */
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode)
+{
+ struct ldlm_lock *lock;
+
+ lock = ldlm_handle2lock(lockh);
+ LASSERT(lock != NULL);
+ ldlm_lock_addref_internal(lock, mode);
+ LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_addref);
+
+/**
+ * Helper function.
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * r/w reference type is determined by \a mode
+ * Removes lock from LRU if it is there.
+ * Assumes the LDLM lock is already locked.
+ */
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
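+	/* A lock holding reader/writer references must not sit on the LRU. */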
+ ldlm_lock_remove_from_lru(lock);
+ if (mode & (LCK_NL | LCK_CR | LCK_PR)) {
+ lock->l_readers++;
+ lu_ref_add_atomic(&lock->l_reference, "reader", lock);
+ }
+ if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) {
+ lock->l_writers++;
+ lu_ref_add_atomic(&lock->l_reference, "writer", lock);
+ }
+ LDLM_LOCK_GET(lock);
+ lu_ref_add_atomic(&lock->l_reference, "user", lock);
+ LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]);
+}
+
+/**
+ * Attempts to add reader/writer reference to a lock with handle \a lockh, and
+ * fails if lock is already LDLM_FL_CBPENDING or destroyed.
+ *
+ * \retval 0 success, lock was addref-ed
+ *
+ * \retval -EAGAIN lock is being canceled.
+ */
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode)
+{
+ struct ldlm_lock *lock;
+ int result;
+
+ result = -EAGAIN;
+ lock = ldlm_handle2lock(lockh);
+ if (lock != NULL) {
+ lock_res_and_lock(lock);
+ if (lock->l_readers != 0 || lock->l_writers != 0 ||
+ !(lock->l_flags & LDLM_FL_CBPENDING)) {
+ ldlm_lock_addref_internal_nolock(lock, mode);
+ result = 0;
+ }
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+ }
+ return result;
+}
+EXPORT_SYMBOL(ldlm_lock_addref_try);
+
+/**
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work.
+ * Only called for local locks.
+ */
+void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+ lock_res_and_lock(lock);
+ ldlm_lock_addref_internal_nolock(lock, mode);
+ unlock_res_and_lock(lock);
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Assumes LDLM lock is already locked.
+ * Only called in ldlm_flock_destroy and for local locks.
+ * Does NOT add the lock to the LRU when the last r/w reference drops, so that
+ * flock locks, which cannot be placed in the LRU, are accommodated.
+ */
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+ LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+ if (mode & (LCK_NL | LCK_CR | LCK_PR)) {
+ LASSERT(lock->l_readers > 0);
+ lu_ref_del(&lock->l_reference, "reader", lock);
+ lock->l_readers--;
+ }
+ if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) {
+ LASSERT(lock->l_writers > 0);
+ lu_ref_del(&lock->l_reference, "writer", lock);
+ lock->l_writers--;
+ }
+
+ lu_ref_del(&lock->l_reference, "user", lock);
+ LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Locks LDLM lock first.
+ * If the lock is a client-side lock, its r/w refcount drops to zero and it is
+ * not blocked, the lock is added to the namespace LRU.
+ * For blocked LDLM locks, if the r/w count drops to zero, the blocking AST is
+ * called.
+ */
+void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+ struct ldlm_namespace *ns;
+
+ lock_res_and_lock(lock);
+
+ ns = ldlm_lock_to_ns(lock);
+
+ ldlm_lock_decref_internal_nolock(lock, mode);
+
+ if (lock->l_flags & LDLM_FL_LOCAL &&
+ !lock->l_readers && !lock->l_writers) {
+ /* If this is a local lock on a server namespace and this was
+ * the last reference, cancel the lock. */
+ CDEBUG(D_INFO, "forcing cancel of local lock\n");
+ lock->l_flags |= LDLM_FL_CBPENDING;
+ }
+
+ if (!lock->l_readers && !lock->l_writers &&
+ (lock->l_flags & LDLM_FL_CBPENDING)) {
+		/* If we received a blocking AST and this was the last reference,
+		 * run the callback. */
+ if ((lock->l_flags & LDLM_FL_NS_SRV) && lock->l_export)
+ CERROR("FL_CBPENDING set on non-local lock--just a warning\n");
+
+ LDLM_DEBUG(lock, "final decref done on cbpending lock");
+
+ LDLM_LOCK_GET(lock); /* dropped by bl thread */
+ ldlm_lock_remove_from_lru(lock);
+ unlock_res_and_lock(lock);
+
+ if (lock->l_flags & LDLM_FL_FAIL_LOC)
+ OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+ if ((lock->l_flags & LDLM_FL_ATOMIC_CB) ||
+ ldlm_bl_to_thread_lock(ns, NULL, lock) != 0)
+ ldlm_handle_bl_callback(ns, NULL, lock);
+ } else if (ns_is_client(ns) &&
+ !lock->l_readers && !lock->l_writers &&
+ !(lock->l_flags & LDLM_FL_NO_LRU) &&
+ !(lock->l_flags & LDLM_FL_BL_AST)) {
+
+ LDLM_DEBUG(lock, "add lock into lru list");
+
+ /* If this is a client-side namespace and this was the last
+ * reference, put it on the LRU. */
+ ldlm_lock_add_to_lru(lock);
+ unlock_res_and_lock(lock);
+
+ if (lock->l_flags & LDLM_FL_FAIL_LOC)
+ OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+ /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE
+ * are not supported by the server, otherwise, it is done on
+ * enqueue. */
+ if (!exp_connect_cancelset(lock->l_conn_export) &&
+ !ns_connect_lru_resize(ns))
+ ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0);
+ } else {
+ LDLM_DEBUG(lock, "do not add lock into lru list");
+ unlock_res_and_lock(lock);
+ }
+}
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle \a lockh
+ */
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
+{
+ struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+
+ LASSERTF(lock != NULL, "Non-existing lock: %#llx\n", lockh->cookie);
+ ldlm_lock_decref_internal(lock, mode);
+ LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref);
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle
+ * \a lockh and mark it for subsequent cancellation once r/w refcount
+ * drops to zero instead of putting into LRU.
+ *
+ * Typical usage is for GROUP locks which we cannot allow to be cached.
+ */
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode)
+{
+ struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+
+ LASSERT(lock != NULL);
+
+ LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+ lock_res_and_lock(lock);
+ lock->l_flags |= LDLM_FL_CBPENDING;
+ unlock_res_and_lock(lock);
+ ldlm_lock_decref_internal(lock, mode);
+ LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref_and_cancel);
+
+struct sl_insert_point {
+ struct list_head *res_link;
+ struct list_head *mode_link;
+ struct list_head *policy_link;
+};
+
+/**
+ * Finds a position to insert the new lock into granted lock list.
+ *
+ * Used for locks eligible for skiplist optimization.
+ *
+ * Parameters:
+ * queue [input]: the granted list where search acts on;
+ * req [input]: the lock whose position to be located;
+ * prev [output]: positions within 3 lists to insert @req to
+ * Return Value:
+ * filled @prev
+ * NOTE: called by
+ * - ldlm_grant_lock_with_skiplist
+ */
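+/*
+ * The granted list is kept grouped: locks of the same mode form a contiguous
+ * "mode group" and, for IBITS locks, locks with identical inodebits within a
+ * mode group form a "policy group".  The l_sl_mode/l_sl_policy links connect
+ * the first and last lock of each group, so the search below can skip over
+ * whole groups instead of visiting every lock.
+ */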
+static void search_granted_lock(struct list_head *queue,
+ struct ldlm_lock *req,
+ struct sl_insert_point *prev)
+{
+ struct list_head *tmp;
+ struct ldlm_lock *lock, *mode_end, *policy_end;
+
+ list_for_each(tmp, queue) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ mode_end = list_entry(lock->l_sl_mode.prev,
+ struct ldlm_lock, l_sl_mode);
+
+ if (lock->l_req_mode != req->l_req_mode) {
+ /* jump to last lock of mode group */
+ tmp = &mode_end->l_res_link;
+ continue;
+ }
+
+ /* suitable mode group is found */
+ if (lock->l_resource->lr_type == LDLM_PLAIN) {
+ /* insert point is last lock of the mode group */
+ prev->res_link = &mode_end->l_res_link;
+ prev->mode_link = &mode_end->l_sl_mode;
+ prev->policy_link = &req->l_sl_policy;
+ return;
+ } else if (lock->l_resource->lr_type == LDLM_IBITS) {
+ for (;;) {
+ policy_end =
+ list_entry(lock->l_sl_policy.prev,
+ struct ldlm_lock,
+ l_sl_policy);
+
+ if (lock->l_policy_data.l_inodebits.bits ==
+ req->l_policy_data.l_inodebits.bits) {
+ /* insert point is last lock of
+ * the policy group */
+ prev->res_link =
+ &policy_end->l_res_link;
+ prev->mode_link =
+ &policy_end->l_sl_mode;
+ prev->policy_link =
+ &policy_end->l_sl_policy;
+ return;
+ }
+
+ if (policy_end == mode_end)
+ /* done with mode group */
+ break;
+
+ /* go to next policy group within mode group */
+ tmp = policy_end->l_res_link.next;
+ lock = list_entry(tmp, struct ldlm_lock,
+ l_res_link);
+ } /* loop over policy groups within the mode group */
+
+ /* insert point is last lock of the mode group,
+ * new policy group is started */
+ prev->res_link = &mode_end->l_res_link;
+ prev->mode_link = &mode_end->l_sl_mode;
+ prev->policy_link = &req->l_sl_policy;
+ return;
+ } else {
+ LDLM_ERROR(lock,
+ "is not LDLM_PLAIN or LDLM_IBITS lock");
+ LBUG();
+ }
+ }
+
+ /* insert point is last lock on the queue,
+ * new mode group and new policy group are started */
+ prev->res_link = queue->prev;
+ prev->mode_link = &req->l_sl_mode;
+ prev->policy_link = &req->l_sl_policy;
+}
+
+/**
+ * Add a lock into resource granted list after a position described by
+ * \a prev.
+ */
+static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
+ struct sl_insert_point *prev)
+{
+ struct ldlm_resource *res = lock->l_resource;
+
+ check_res_locked(res);
+
+ ldlm_resource_dump(D_INFO, res);
+ LDLM_DEBUG(lock, "About to add lock:");
+
+ if (lock->l_flags & LDLM_FL_DESTROYED) {
+ CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+ return;
+ }
+
+ LASSERT(list_empty(&lock->l_res_link));
+ LASSERT(list_empty(&lock->l_sl_mode));
+ LASSERT(list_empty(&lock->l_sl_policy));
+
+	/*
+	 * lock->link == prev->link means the lock starts a new group, i.e. it
+	 * is the first lock of that group. Don't add the lock after itself,
+	 * which would trigger kernel list-debugging warnings.
+	 */
+ if (&lock->l_res_link != prev->res_link)
+ list_add(&lock->l_res_link, prev->res_link);
+ if (&lock->l_sl_mode != prev->mode_link)
+ list_add(&lock->l_sl_mode, prev->mode_link);
+ if (&lock->l_sl_policy != prev->policy_link)
+ list_add(&lock->l_sl_policy, prev->policy_link);
+}
+
+/**
+ * Add a lock to granted list on a resource maintaining skiplist
+ * correctness.
+ */
+static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+{
+ struct sl_insert_point prev;
+
+ LASSERT(lock->l_req_mode == lock->l_granted_mode);
+
+ search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
+ ldlm_granted_list_add_lock(lock, &prev);
+}
+
+/**
+ * Perform lock granting bookkeeping.
+ *
+ * Includes putting the lock into granted list and updating lock mode.
+ * NOTE: called by
+ * - ldlm_lock_enqueue
+ * - ldlm_reprocess_queue
+ * - ldlm_lock_convert
+ *
+ * must be called with lr_lock held
+ */
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
+{
+ struct ldlm_resource *res = lock->l_resource;
+
+ check_res_locked(res);
+
+ lock->l_granted_mode = lock->l_req_mode;
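+	/* Insert into the granted list using the method that matches the
+	 * resource type: skiplists for PLAIN/IBITS, the interval tree for
+	 * EXTENT, and a plain list otherwise. */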
+ if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS)
+ ldlm_grant_lock_with_skiplist(lock);
+ else if (res->lr_type == LDLM_EXTENT)
+ ldlm_extent_add_lock(res, lock);
+ else
+ ldlm_resource_add_lock(res, &res->lr_granted, lock);
+
+ if (lock->l_granted_mode < res->lr_most_restr)
+ res->lr_most_restr = lock->l_granted_mode;
+
+ if (work_list && lock->l_completion_ast != NULL)
+ ldlm_add_ast_work_item(lock, NULL, work_list);
+
+ ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock);
+}
+
+/**
+ * Search for a lock with given properties in a queue.
+ *
+ * \retval a referenced lock or NULL. See the flag descriptions below, in the
+ * comment above ldlm_lock_match
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+ ldlm_mode_t *mode,
+ ldlm_policy_data_t *policy,
+ struct ldlm_lock *old_lock,
+ __u64 flags, int unref)
+{
+ struct ldlm_lock *lock;
+ struct list_head *tmp;
+
+ list_for_each(tmp, queue) {
+ ldlm_mode_t match;
+
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (lock == old_lock)
+ break;
+
+		/* Check if this lock can be matched.
+		 * Used by LU-2919 (exclusive open) for open lease locks. */
+ if (ldlm_is_excl(lock))
+ continue;
+
+ /* llite sometimes wants to match locks that will be
+ * canceled when their users drop, but we allow it to match
+ * if it passes in CBPENDING and the lock still has users.
+		 * This is generally only going to be used by children
+ * whose parents already hold a lock so forward progress
+ * can still happen. */
+ if (lock->l_flags & LDLM_FL_CBPENDING &&
+ !(flags & LDLM_FL_CBPENDING))
+ continue;
+ if (!unref && lock->l_flags & LDLM_FL_CBPENDING &&
+ lock->l_readers == 0 && lock->l_writers == 0)
+ continue;
+
+ if (!(lock->l_req_mode & *mode))
+ continue;
+ match = lock->l_req_mode;
+
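+		/* An extent lock matches only if it fully covers the
+		 * requested range. */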
+ if (lock->l_resource->lr_type == LDLM_EXTENT &&
+ (lock->l_policy_data.l_extent.start >
+ policy->l_extent.start ||
+ lock->l_policy_data.l_extent.end < policy->l_extent.end))
+ continue;
+
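+		/* GROUP extent locks additionally require an exact group ID
+		 * match. */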
+ if (unlikely(match == LCK_GROUP) &&
+ lock->l_resource->lr_type == LDLM_EXTENT &&
+ lock->l_policy_data.l_extent.gid != policy->l_extent.gid)
+ continue;
+
+		/* We match if we have an existing lock with the same or a
+		 * wider set of bits. */
+ if (lock->l_resource->lr_type == LDLM_IBITS &&
+ ((lock->l_policy_data.l_inodebits.bits &
+ policy->l_inodebits.bits) !=
+ policy->l_inodebits.bits))
+ continue;
+
+ if (!unref && (lock->l_flags & LDLM_FL_GONE_MASK))
+ continue;
+
+ if ((flags & LDLM_FL_LOCAL_ONLY) &&
+ !(lock->l_flags & LDLM_FL_LOCAL))
+ continue;
+
+ if (flags & LDLM_FL_TEST_LOCK) {
+ LDLM_LOCK_GET(lock);
+ ldlm_lock_touch_in_lru(lock);
+ } else {
+ ldlm_lock_addref_internal_nolock(lock, match);
+ }
+ *mode = match;
+ return lock;
+ }
+
+ return NULL;
+}
+
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+ if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) {
+ lock->l_flags |= LDLM_FL_FAIL_NOTIFIED;
+ wake_up_all(&lock->l_waitq);
+ }
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+ lock_res_and_lock(lock);
+ ldlm_lock_fail_match_locked(lock);
+ unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match);
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+ lock->l_flags |= LDLM_FL_LVB_READY;
+ wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ * Locks the lock and then \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+ lock_res_and_lock(lock);
+ ldlm_lock_allow_match_locked(lock);
+ unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is
+ * set in \a flags
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ * server (i.e., connh is NULL).
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ * list will be considered.
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ * to be canceled can still be matched as long as they still have reader
+ * or writer references.
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ * just tell us if we would have matched.
+ *
+ * \retval 1 if it finds an already-existing lock that is compatible; in this
+ * case, lockh is filled in with an addref()ed lock
+ *
+ * We also check security context, and if that fails we simply return 0 (to
+ * keep caller code unchanged), the context failure will be discovered by
+ * caller sometime later.
+ */
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+ const struct ldlm_res_id *res_id, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ struct lustre_handle *lockh, int unref)
+{
+ struct ldlm_resource *res;
+ struct ldlm_lock *lock, *old_lock = NULL;
+ int rc = 0;
+
+ if (ns == NULL) {
+ old_lock = ldlm_handle2lock(lockh);
+ LASSERT(old_lock);
+
+ ns = ldlm_lock_to_ns(old_lock);
+ res_id = &old_lock->l_resource->lr_name;
+ type = old_lock->l_resource->lr_type;
+ mode = old_lock->l_req_mode;
+ }
+
+ res = ldlm_resource_get(ns, NULL, res_id, type, 0);
+ if (res == NULL) {
+ LASSERT(old_lock == NULL);
+ return 0;
+ }
+
+ LDLM_RESOURCE_ADDREF(res);
+ lock_res(res);
+
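+	/* Search the granted queue first; unless LDLM_FL_BLOCK_GRANTED limits
+	 * the search, fall back to the converting and waiting queues. */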
+ lock = search_queue(&res->lr_granted, &mode, policy, old_lock,
+ flags, unref);
+ if (lock != NULL) {
+ rc = 1;
+ goto out;
+ }
+ if (flags & LDLM_FL_BLOCK_GRANTED) {
+ rc = 0;
+ goto out;
+ }
+ lock = search_queue(&res->lr_converting, &mode, policy, old_lock,
+ flags, unref);
+ if (lock != NULL) {
+ rc = 1;
+ goto out;
+ }
+ lock = search_queue(&res->lr_waiting, &mode, policy, old_lock,
+ flags, unref);
+ if (lock != NULL) {
+ rc = 1;
+ goto out;
+ }
+
+ out:
+ unlock_res(res);
+ LDLM_RESOURCE_DELREF(res);
+ ldlm_resource_putref(res);
+
+ if (lock) {
+ ldlm_lock2handle(lock, lockh);
+ if ((flags & LDLM_FL_LVB_READY) &&
+ (!(lock->l_flags & LDLM_FL_LVB_READY))) {
+ __u64 wait_flags = LDLM_FL_LVB_READY |
+ LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED;
+ struct l_wait_info lwi;
+
+ if (lock->l_completion_ast) {
+ int err = lock->l_completion_ast(lock,
+ LDLM_FL_WAIT_NOREPROC,
+ NULL);
+ if (err) {
+ if (flags & LDLM_FL_TEST_LOCK)
+ LDLM_LOCK_RELEASE(lock);
+ else
+ ldlm_lock_decref_internal(lock,
+ mode);
+ rc = 0;
+ goto out2;
+ }
+ }
+
+ lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+ NULL, LWI_ON_SIGNAL_NOOP, NULL);
+
+ /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */
+ l_wait_event(lock->l_waitq,
+ lock->l_flags & wait_flags,
+ &lwi);
+ if (!(lock->l_flags & LDLM_FL_LVB_READY)) {
+ if (flags & LDLM_FL_TEST_LOCK)
+ LDLM_LOCK_RELEASE(lock);
+ else
+ ldlm_lock_decref_internal(lock, mode);
+ rc = 0;
+ }
+ }
+ }
+ out2:
+ if (rc) {
+ LDLM_DEBUG(lock, "matched (%llu %llu)",
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[2] : policy->l_extent.start,
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[3] : policy->l_extent.end);
+
+ /* check user's security context */
+ if (lock->l_conn_export &&
+ sptlrpc_import_check_ctx(
+ class_exp2cliimp(lock->l_conn_export))) {
+ if (!(flags & LDLM_FL_TEST_LOCK))
+ ldlm_lock_decref_internal(lock, mode);
+ rc = 0;
+ }
+
+ if (flags & LDLM_FL_TEST_LOCK)
+ LDLM_LOCK_RELEASE(lock);
+
+	} else if (!(flags & LDLM_FL_TEST_LOCK)) { /* less verbose for test-only */
+ LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res %llu/%llu (%llu %llu)",
+ ns, type, mode, res_id->name[0],
+ res_id->name[1],
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[2] : policy->l_extent.start,
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[3] : policy->l_extent.end);
+ }
+ if (old_lock)
+ LDLM_LOCK_PUT(old_lock);
+
+ return rc ? mode : 0;
+}
+EXPORT_SYMBOL(ldlm_lock_match);
+
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+ __u64 *bits)
+{
+ struct ldlm_lock *lock;
+ ldlm_mode_t mode = 0;
+
+ lock = ldlm_handle2lock(lockh);
+ if (lock != NULL) {
+ lock_res_and_lock(lock);
+ if (lock->l_flags & LDLM_FL_GONE_MASK)
+ goto out;
+
+ if (lock->l_flags & LDLM_FL_CBPENDING &&
+ lock->l_readers == 0 && lock->l_writers == 0)
+ goto out;
+
+ if (bits)
+ *bits = lock->l_policy_data.l_inodebits.bits;
+ mode = lock->l_granted_mode;
+ ldlm_lock_addref_internal_nolock(lock, mode);
+ }
+
+out:
+ if (lock != NULL) {
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+ }
+ return mode;
+}
+EXPORT_SYMBOL(ldlm_revalidate_lock_handle);
+
+/** The caller must guarantee that the buffer is large enough. */
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+ enum req_location loc, void *data, int size)
+{
+ void *lvb;
+
+ LASSERT(data != NULL);
+ LASSERT(size >= 0);
+
+ switch (lock->l_lvb_type) {
+ case LVB_T_OST:
+ if (size == sizeof(struct ost_lvb)) {
+ if (loc == RCL_CLIENT)
+ lvb = req_capsule_client_swab_get(pill,
+ &RMF_DLM_LVB,
+ lustre_swab_ost_lvb);
+ else
+ lvb = req_capsule_server_swab_get(pill,
+ &RMF_DLM_LVB,
+ lustre_swab_ost_lvb);
+ if (unlikely(lvb == NULL)) {
+ LDLM_ERROR(lock, "no LVB");
+ return -EPROTO;
+ }
+
+ memcpy(data, lvb, size);
+ } else if (size == sizeof(struct ost_lvb_v1)) {
+ struct ost_lvb *olvb = data;
+
+ if (loc == RCL_CLIENT)
+ lvb = req_capsule_client_swab_get(pill,
+ &RMF_DLM_LVB,
+ lustre_swab_ost_lvb_v1);
+ else
+ lvb = req_capsule_server_sized_swab_get(pill,
+ &RMF_DLM_LVB, size,
+ lustre_swab_ost_lvb_v1);
+ if (unlikely(lvb == NULL)) {
+ LDLM_ERROR(lock, "no LVB");
+ return -EPROTO;
+ }
+
+ memcpy(data, lvb, size);
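+			/* ost_lvb_v1 lacks the nanosecond timestamp fields,
+			 * so clear them in the caller's full-size ost_lvb. */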
+ olvb->lvb_mtime_ns = 0;
+ olvb->lvb_atime_ns = 0;
+ olvb->lvb_ctime_ns = 0;
+ } else {
+ LDLM_ERROR(lock, "Replied unexpected ost LVB size %d",
+ size);
+ return -EINVAL;
+ }
+ break;
+ case LVB_T_LQUOTA:
+ if (size == sizeof(struct lquota_lvb)) {
+ if (loc == RCL_CLIENT)
+ lvb = req_capsule_client_swab_get(pill,
+ &RMF_DLM_LVB,
+ lustre_swab_lquota_lvb);
+ else
+ lvb = req_capsule_server_swab_get(pill,
+ &RMF_DLM_LVB,
+ lustre_swab_lquota_lvb);
+ if (unlikely(lvb == NULL)) {
+ LDLM_ERROR(lock, "no LVB");
+ return -EPROTO;
+ }
+
+ memcpy(data, lvb, size);
+ } else {
+ LDLM_ERROR(lock,
+ "Replied unexpected lquota LVB size %d",
+ size);
+ return -EINVAL;
+ }
+ break;
+ case LVB_T_LAYOUT:
+ if (size == 0)
+ break;
+
+ if (loc == RCL_CLIENT)
+ lvb = req_capsule_client_get(pill, &RMF_DLM_LVB);
+ else
+ lvb = req_capsule_server_get(pill, &RMF_DLM_LVB);
+ if (unlikely(lvb == NULL)) {
+ LDLM_ERROR(lock, "no LVB");
+ return -EPROTO;
+ }
+
+ memcpy(data, lvb, size);
+ break;
+ default:
+ LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type);
+ dump_stack();
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * Create and fill in new LDLM lock with specified properties.
+ * Returns a referenced lock
+ */
+struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_type_t type,
+ ldlm_mode_t mode,
+ const struct ldlm_callback_suite *cbs,
+ void *data, __u32 lvb_len,
+ enum lvb_type lvb_type)
+{
+ struct ldlm_lock *lock;
+ struct ldlm_resource *res;
+
+ res = ldlm_resource_get(ns, NULL, res_id, type, 1);
+ if (res == NULL)
+ return NULL;
+
+ lock = ldlm_lock_new(res);
+
+ if (lock == NULL)
+ return NULL;
+
+ lock->l_req_mode = mode;
+ lock->l_ast_data = data;
+ lock->l_pid = current_pid();
+ if (ns_is_server(ns))
+ lock->l_flags |= LDLM_FL_NS_SRV;
+ if (cbs) {
+ lock->l_blocking_ast = cbs->lcs_blocking;
+ lock->l_completion_ast = cbs->lcs_completion;
+ lock->l_glimpse_ast = cbs->lcs_glimpse;
+ }
+
+ lock->l_tree_node = NULL;
+ /* if this is the extent lock, allocate the interval tree node */
+ if (type == LDLM_EXTENT) {
+ if (ldlm_interval_alloc(lock) == NULL)
+ goto out;
+ }
+
+ if (lvb_len) {
+ lock->l_lvb_len = lvb_len;
+ OBD_ALLOC(lock->l_lvb_data, lvb_len);
+ if (lock->l_lvb_data == NULL)
+ goto out;
+ }
+
+ lock->l_lvb_type = lvb_type;
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK))
+ goto out;
+
+ return lock;
+
+out:
+ ldlm_lock_destroy(lock);
+ LDLM_LOCK_RELEASE(lock);
+ return NULL;
+}
+
+/**
+ * Enqueue (request) a lock.
+ *
+ * Does not block. As a result of the enqueue the lock is put onto the
+ * granted or waiting list.
+ *
+ * If the namespace has an intent policy set and the lock has the
+ * LDLM_FL_HAS_INTENT flag set, skip all the enqueueing and delegate lock
+ * processing to the intent policy function.
+ */
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
+ struct ldlm_lock **lockp,
+ void *cookie, __u64 *flags)
+{
+ struct ldlm_lock *lock = *lockp;
+ struct ldlm_resource *res = lock->l_resource;
+ int local = ns_is_client(ldlm_res_to_ns(res));
+ ldlm_error_t rc = ELDLM_OK;
+ struct ldlm_interval *node = NULL;
+
+ lock->l_last_activity = get_seconds();
+ /* policies are not executed on the client or during replay */
+ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
+ && !local && ns->ns_policy) {
+ rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags,
+ NULL);
+ if (rc == ELDLM_LOCK_REPLACED) {
+ /* The lock that was returned has already been granted,
+ * and placed into lockp. If it's not the same as the
+ * one we passed in, then destroy the old one and our
+ * work here is done. */
+ if (lock != *lockp) {
+ ldlm_lock_destroy(lock);
+ LDLM_LOCK_RELEASE(lock);
+ }
+ *flags |= LDLM_FL_LOCK_CHANGED;
+ return 0;
+ } else if (rc != ELDLM_OK ||
+ (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) {
+ ldlm_lock_destroy(lock);
+ return rc;
+ }
+ }
+
+	/* A replaying lock might already be in the granted list, so unlinking
+	 * it will free its interval node. Allocate a new interval node early,
+	 * otherwise we cannot regrant this lock in the future. - jay */
+ if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT)
+ OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS);
+
+ lock_res_and_lock(lock);
+ if (local && lock->l_req_mode == lock->l_granted_mode) {
+ /* The server returned a blocked lock, but it was granted
+ * before we got a chance to actually enqueue it. We don't
+ * need to do anything else. */
+ *flags &= ~(LDLM_FL_BLOCK_GRANTED |
+ LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT);
+ goto out;
+ }
+
+ ldlm_resource_unlink_lock(lock);
+ if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+ if (node == NULL) {
+ ldlm_lock_destroy_nolock(lock);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&node->li_group);
+ ldlm_interval_attach(node, lock);
+ node = NULL;
+ }
+
+ /* Some flags from the enqueue want to make it into the AST, via the
+ * lock's l_flags. */
+ lock->l_flags |= *flags & LDLM_FL_AST_DISCARD_DATA;
+
+ /* This distinction between local lock trees is very important; a client
+ * namespace only has information about locks taken by that client, and
+ * thus doesn't have enough information to decide for itself if it can
+ * be granted (below). In this case, we do exactly what the server
+ * tells us to do, as dictated by the 'flags'.
+ *
+ * We do exactly the same thing during recovery, when the server is
+ * more or less trusting the clients not to lie.
+ *
+ * FIXME (bug 268): Detect obvious lies by checking compatibility in
+ * granted/converting queues. */
+ if (local) {
+ if (*flags & LDLM_FL_BLOCK_CONV)
+ ldlm_resource_add_lock(res, &res->lr_converting, lock);
+ else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED))
+ ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+ else
+ ldlm_grant_lock(lock, NULL);
+ goto out;
+ } else {
+ CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n");
+ LBUG();
+ }
+
+out:
+ unlock_res_and_lock(lock);
+ if (node)
+ OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+ return rc;
+}
+
+
+/**
+ * Process a call to blocking AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+ struct ldlm_cb_set_arg *arg = opaq;
+ struct ldlm_lock_desc d;
+ int rc;
+ struct ldlm_lock *lock;
+
+ if (list_empty(arg->list))
+ return -ENOENT;
+
+ lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast);
+
+ /* nobody should touch l_bl_ast */
+ lock_res_and_lock(lock);
+ list_del_init(&lock->l_bl_ast);
+
+ LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
+ LASSERT(lock->l_bl_ast_run == 0);
+ LASSERT(lock->l_blocking_lock);
+ lock->l_bl_ast_run++;
+ unlock_res_and_lock(lock);
+
+ ldlm_lock2desc(lock->l_blocking_lock, &d);
+
+ rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING);
+ LDLM_LOCK_RELEASE(lock->l_blocking_lock);
+ lock->l_blocking_lock = NULL;
+ LDLM_LOCK_RELEASE(lock);
+
+ return rc;
+}
+
+/**
+ * Process a call to completion AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+ struct ldlm_cb_set_arg *arg = opaq;
+ int rc = 0;
+ struct ldlm_lock *lock;
+ ldlm_completion_callback completion_callback;
+
+ if (list_empty(arg->list))
+ return -ENOENT;
+
+ lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast);
+
+ /* It's possible to receive a completion AST before we've set
+ * the l_completion_ast pointer: either because the AST arrived
+ * before the reply, or simply because there's a small race
+ * window between receiving the reply and finishing the local
+ * enqueue. (bug 842)
+ *
+ * This can't happen with the blocking_ast, however, because we
+ * will never call the local blocking_ast until we drop our
+ * reader/writer reference, which we won't do until we get the
+ * reply and finish enqueueing. */
+
+ /* nobody should touch l_cp_ast */
+ lock_res_and_lock(lock);
+ list_del_init(&lock->l_cp_ast);
+ LASSERT(lock->l_flags & LDLM_FL_CP_REQD);
+ /* save l_completion_ast since it can be changed by
+ * mds_intent_policy(), see bug 14225 */
+ completion_callback = lock->l_completion_ast;
+ lock->l_flags &= ~LDLM_FL_CP_REQD;
+ unlock_res_and_lock(lock);
+
+ if (completion_callback != NULL)
+ rc = completion_callback(lock, 0, (void *)arg);
+ LDLM_LOCK_RELEASE(lock);
+
+ return rc;
+}
+
+/**
+ * Process a call to revocation AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+ struct ldlm_cb_set_arg *arg = opaq;
+ struct ldlm_lock_desc desc;
+ int rc;
+ struct ldlm_lock *lock;
+
+ if (list_empty(arg->list))
+ return -ENOENT;
+
+ lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast);
+ list_del_init(&lock->l_rk_ast);
+
+	/* the desc just pretends to be exclusive */
+ ldlm_lock2desc(lock, &desc);
+ desc.l_req_mode = LCK_EX;
+ desc.l_granted_mode = 0;
+
+ rc = lock->l_blocking_ast(lock, &desc, (void *)arg, LDLM_CB_BLOCKING);
+ LDLM_LOCK_RELEASE(lock);
+
+ return rc;
+}
+
+/**
+ * Process a call to glimpse AST callback for a lock in ast_work list
+ */
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+ struct ldlm_cb_set_arg *arg = opaq;
+ struct ldlm_glimpse_work *gl_work;
+ struct ldlm_lock *lock;
+ int rc = 0;
+
+ if (list_empty(arg->list))
+ return -ENOENT;
+
+ gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work,
+ gl_list);
+ list_del_init(&gl_work->gl_list);
+
+ lock = gl_work->gl_lock;
+
+ /* transfer the glimpse descriptor to ldlm_cb_set_arg */
+ arg->gl_desc = gl_work->gl_desc;
+
+ /* invoke the actual glimpse callback */
+ if (lock->l_glimpse_ast(lock, (void *)arg) == 0)
+ rc = 1;
+
+ LDLM_LOCK_RELEASE(lock);
+
+ if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0)
+ OBD_FREE_PTR(gl_work);
+
+ return rc;
+}
+
+/**
+ * Process list of locks in need of ASTs being sent.
+ *
+ * Used on server to send multiple ASTs together instead of sending one by
+ * one.
+ */
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+ enum ldlm_desc_ast_t ast_type)
+{
+ struct ldlm_cb_set_arg *arg;
+ set_producer_func work_ast_lock;
+ int rc;
+
+ if (list_empty(rpc_list))
+ return 0;
+
+ OBD_ALLOC_PTR(arg);
+ if (arg == NULL)
+ return -ENOMEM;
+
+ atomic_set(&arg->restart, 0);
+ arg->list = rpc_list;
+
+ switch (ast_type) {
+ case LDLM_WORK_BL_AST:
+ arg->type = LDLM_BL_CALLBACK;
+ work_ast_lock = ldlm_work_bl_ast_lock;
+ break;
+ case LDLM_WORK_CP_AST:
+ arg->type = LDLM_CP_CALLBACK;
+ work_ast_lock = ldlm_work_cp_ast_lock;
+ break;
+ case LDLM_WORK_REVOKE_AST:
+ arg->type = LDLM_BL_CALLBACK;
+ work_ast_lock = ldlm_work_revoke_ast_lock;
+ break;
+ case LDLM_WORK_GL_AST:
+ arg->type = LDLM_GL_CALLBACK;
+ work_ast_lock = ldlm_work_gl_ast_lock;
+ break;
+ default:
+ LBUG();
+ }
+
+ /* We create a ptlrpc request set with flow control extension.
+ * This request set will use the work_ast_lock function to produce new
+	 * requests and will send a new request each time one completes, in order
+	 * to keep the number of requests in flight at ns_max_parallel_ast. */
+ arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX,
+ work_ast_lock, arg);
+ if (arg->set == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ ptlrpc_set_wait(arg->set);
+ ptlrpc_set_destroy(arg->set);
+
+ rc = atomic_read(&arg->restart) ? -ERESTART : 0;
+ goto out;
+out:
+ OBD_FREE_PTR(arg);
+ return rc;
+}
+
+static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
+{
+ ldlm_reprocess_all(res);
+ return LDLM_ITER_CONTINUE;
+}
+
+static int ldlm_reprocess_res(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+ int rc;
+
+ rc = reprocess_one_queue(res, arg);
+
+ return rc == LDLM_ITER_STOP;
+}
+
+/**
+ * Iterate through all resources on a namespace attempting to grant waiting
+ * locks.
+ */
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
+{
+ if (ns != NULL) {
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ ldlm_reprocess_res, NULL);
+ }
+}
+EXPORT_SYMBOL(ldlm_reprocess_all_ns);
+
+/**
+ * Try to grant all waiting locks on a resource.
+ *
+ * Calls ldlm_reprocess_queue on converting and waiting queues.
+ *
+ * Typically called after some resource locks are cancelled to see
+ * if anything could be granted as a result of the cancellation.
+ */
+void ldlm_reprocess_all(struct ldlm_resource *res)
+{
+ LIST_HEAD(rpc_list);
+
+ if (!ns_is_client(ldlm_res_to_ns(res))) {
+ CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n");
+ LBUG();
+ }
+}
+
+/**
+ * Helper function to call blocking AST for LDLM lock \a lock in a
+ * "cancelling" mode.
+ */
+void ldlm_cancel_callback(struct ldlm_lock *lock)
+{
+ check_res_locked(lock->l_resource);
+ if (!(lock->l_flags & LDLM_FL_CANCEL)) {
+ lock->l_flags |= LDLM_FL_CANCEL;
+ if (lock->l_blocking_ast) {
+ unlock_res_and_lock(lock);
+ lock->l_blocking_ast(lock, NULL, lock->l_ast_data,
+ LDLM_CB_CANCELING);
+ lock_res_and_lock(lock);
+ } else {
+ LDLM_DEBUG(lock, "no blocking ast");
+ }
+ }
+ lock->l_flags |= LDLM_FL_BL_DONE;
+}
+
+/**
+ * Remove skiplist-enabled LDLM lock \a req from granted list
+ */
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req)
+{
+ if (req->l_resource->lr_type != LDLM_PLAIN &&
+ req->l_resource->lr_type != LDLM_IBITS)
+ return;
+
+ list_del_init(&req->l_sl_policy);
+ list_del_init(&req->l_sl_mode);
+}
+
+/**
+ * Attempts to cancel LDLM lock \a lock that has no reader/writer references.
+ */
+void ldlm_lock_cancel(struct ldlm_lock *lock)
+{
+ struct ldlm_resource *res;
+ struct ldlm_namespace *ns;
+
+ lock_res_and_lock(lock);
+
+ res = lock->l_resource;
+ ns = ldlm_res_to_ns(res);
+
+ /* Please do not, no matter how tempting, remove this LBUG without
+ * talking to me first. -phik */
+ if (lock->l_readers || lock->l_writers) {
+ LDLM_ERROR(lock, "lock still has references");
+ LBUG();
+ }
+
+ if (lock->l_flags & LDLM_FL_WAITED)
+ ldlm_del_waiting_lock(lock);
+
+ /* Releases cancel callback. */
+ ldlm_cancel_callback(lock);
+
+ /* Yes, second time, just in case it was added again while we were
+ * running with no res lock in ldlm_cancel_callback */
+ if (lock->l_flags & LDLM_FL_WAITED)
+ ldlm_del_waiting_lock(lock);
+
+ ldlm_resource_unlink_lock(lock);
+ ldlm_lock_destroy_nolock(lock);
+
+ if (lock->l_granted_mode == lock->l_req_mode)
+ ldlm_pool_del(&ns->ns_pool, lock);
+
+	/* Make sure we will not be called again for the same lock, which is
+	 * possible if we do not zero out lock->l_granted_mode. */
+ lock->l_granted_mode = LCK_MINMODE;
+ unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_cancel);
+
+/**
+ * Set opaque data into the lock that only makes sense to upper layer.
+ */
+int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
+{
+ struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+ int rc = -EINVAL;
+
+ if (lock) {
+ if (lock->l_ast_data == NULL)
+ lock->l_ast_data = data;
+ if (lock->l_ast_data == data)
+ rc = 0;
+ LDLM_LOCK_PUT(lock);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_lock_set_data);
+
+struct export_cl_data {
+ struct obd_export *ecl_exp;
+ int ecl_loop;
+};
+
+/**
+ * Iterator function for ldlm_cancel_locks_for_export.
+ * Cancels passed locks.
+ */
+int ldlm_cancel_locks_for_export_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
+
+{
+ struct export_cl_data *ecl = (struct export_cl_data *)data;
+ struct obd_export *exp = ecl->ecl_exp;
+ struct ldlm_lock *lock = cfs_hash_object(hs, hnode);
+ struct ldlm_resource *res;
+
+ res = ldlm_resource_getref(lock->l_resource);
+ LDLM_LOCK_GET(lock);
+
+ LDLM_DEBUG(lock, "export %p", exp);
+ ldlm_res_lvbo_update(res, NULL, 1);
+ ldlm_lock_cancel(lock);
+ ldlm_reprocess_all(res);
+ ldlm_resource_putref(res);
+ LDLM_LOCK_RELEASE(lock);
+
+ ecl->ecl_loop++;
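+	/* (x & -x) == x only for powers of two, so this message is logged on
+	 * loop counts 1, 2, 4, 8, ... to throttle the debug output. */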
+ if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) {
+ CDEBUG(D_INFO,
+ "Cancel lock %p for export %p (loop %d), still have %d locks left on hash table.\n",
+ lock, exp, ecl->ecl_loop,
+ atomic_read(&hs->hs_count));
+ }
+
+ return 0;
+}
+
+/**
+ * Cancel all locks for given export.
+ *
+ * Typically called on client disconnection/eviction
+ */
+void ldlm_cancel_locks_for_export(struct obd_export *exp)
+{
+ struct export_cl_data ecl = {
+ .ecl_exp = exp,
+ .ecl_loop = 0,
+ };
+
+ cfs_hash_for_each_empty(exp->exp_lock_hash,
+ ldlm_cancel_locks_for_export_cb, &ecl);
+}
+
+/**
+ * Downgrade an exclusive lock.
+ *
+ * A fast variant of ldlm_lock_convert for conversion of exclusive
+ * locks. The conversion is always successful.
+ * Used by Commit on Sharing (COS) code.
+ *
+ * \param lock A lock to convert
+ * \param new_mode new lock mode
+ */
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode)
+{
+ LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX));
+ LASSERT(new_mode == LCK_COS);
+
+ lock_res_and_lock(lock);
+ ldlm_resource_unlink_lock(lock);
+ /*
+ * Remove the lock from pool as it will be added again in
+ * ldlm_grant_lock() called below.
+ */
+ ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock);
+
+ lock->l_req_mode = new_mode;
+ ldlm_grant_lock(lock, NULL);
+ unlock_res_and_lock(lock);
+ ldlm_reprocess_all(lock->l_resource);
+}
+EXPORT_SYMBOL(ldlm_lock_downgrade);
+
+/**
+ * Attempt to convert already granted lock to a different mode.
+ *
+ * While lock conversion is not currently used, future client-side
+ * optimizations could take advantage of it to avoid discarding cached
+ * pages on a file.
+ */
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+ __u32 *flags)
+{
+ LIST_HEAD(rpc_list);
+ struct ldlm_resource *res;
+ struct ldlm_namespace *ns;
+ int granted = 0;
+ struct ldlm_interval *node;
+
+ /* Just return if mode is unchanged. */
+ if (new_mode == lock->l_granted_mode) {
+ *flags |= LDLM_FL_BLOCK_GRANTED;
+ return lock->l_resource;
+ }
+
+	/* I can't check the type of lock here because the lock's bitlock
+	 * is not held here, so do the allocation blindly. -jay */
+ OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, GFP_NOFS);
+ if (node == NULL)
+ /* Actually, this causes EDEADLOCK to be returned */
+ return NULL;
+
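+	/* The only conversion this code expects is upgrading a PR lock to PW. */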
+ LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR),
+ "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode);
+
+ lock_res_and_lock(lock);
+
+ res = lock->l_resource;
+ ns = ldlm_res_to_ns(res);
+
+ lock->l_req_mode = new_mode;
+ if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) {
+ ldlm_resource_unlink_lock(lock);
+ } else {
+ ldlm_resource_unlink_lock(lock);
+ if (res->lr_type == LDLM_EXTENT) {
+			/* FIXME: ugly code, I have to attach the lock to an
+			 * interval node again since perhaps it will be granted
+			 * soon */
+ INIT_LIST_HEAD(&node->li_group);
+ ldlm_interval_attach(node, lock);
+ node = NULL;
+ }
+ }
+
+ /*
+ * Remove old lock from the pool before adding the lock with new
+ * mode below in ->policy()
+ */
+ ldlm_pool_del(&ns->ns_pool, lock);
+
+ /* If this is a local resource, put it on the appropriate list. */
+ if (ns_is_client(ldlm_res_to_ns(res))) {
+ if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) {
+ ldlm_resource_add_lock(res, &res->lr_converting, lock);
+ } else {
+ /* This should never happen, because of the way the
+ * server handles conversions. */
+ LDLM_ERROR(lock, "Erroneous flags %x on local lock\n",
+ *flags);
+ LBUG();
+
+ ldlm_grant_lock(lock, &rpc_list);
+ granted = 1;
+ /* FIXME: completion handling not with lr_lock held ! */
+ if (lock->l_completion_ast)
+ lock->l_completion_ast(lock, 0, NULL);
+ }
+ } else {
+ CERROR("This is client-side-only module, cannot handle LDLM_NAMESPACE_SERVER resource type lock.\n");
+ LBUG();
+ }
+ unlock_res_and_lock(lock);
+
+ if (granted)
+ ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST);
+ if (node)
+ OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+ return res;
+}
+EXPORT_SYMBOL(ldlm_lock_convert);
+
+/**
+ * Print lock with lock handle \a lockh description into debug log.
+ *
+ * Used when printing all locks on a resource for debug purposes.
+ */
+void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
+{
+ struct ldlm_lock *lock;
+
+ if (!((libcfs_debug | D_ERROR) & level))
+ return;
+
+ lock = ldlm_handle2lock(lockh);
+ if (lock == NULL)
+ return;
+
+ LDLM_DEBUG_LIMIT(level, lock, "###");
+
+ LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_dump_handle);
+
+/**
+ * Print lock information with custom message into debug log.
+ * Helper function.
+ */
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+ struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct obd_export *exp = lock->l_export;
+ struct ldlm_resource *resource = lock->l_resource;
+ char *nid = "local";
+
+ va_start(args, fmt);
+
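+	/* Prefer the peer NID of the export's connection; if the export has
+	 * no connection yet, fall back to the client import's connection. */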
+ if (exp && exp->exp_connection) {
+ nid = libcfs_nid2str(exp->exp_connection->c_peer.nid);
+ } else if (exp && exp->exp_obd != NULL) {
+ struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
+
+ nid = libcfs_nid2str(imp->imp_connection->c_peer.nid);
+ }
+
+ if (resource == NULL) {
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+ lock,
+ lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+ lock->l_readers, lock->l_writers,
+ ldlm_lockname[lock->l_granted_mode],
+ ldlm_lockname[lock->l_req_mode],
+ lock->l_flags, nid, lock->l_remote_handle.cookie,
+ exp ? atomic_read(&exp->exp_refcount) : -99,
+ lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+ va_end(args);
+ return;
+ }
+
+ switch (resource->lr_type) {
+ case LDLM_EXTENT:
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+ ldlm_lock_to_ns_name(lock), lock,
+ lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+ lock->l_readers, lock->l_writers,
+ ldlm_lockname[lock->l_granted_mode],
+ ldlm_lockname[lock->l_req_mode],
+ PLDLMRES(resource),
+ atomic_read(&resource->lr_refcount),
+ ldlm_typename[resource->lr_type],
+ lock->l_policy_data.l_extent.start,
+ lock->l_policy_data.l_extent.end,
+ lock->l_req_extent.start, lock->l_req_extent.end,
+ lock->l_flags, nid, lock->l_remote_handle.cookie,
+ exp ? atomic_read(&exp->exp_refcount) : -99,
+ lock->l_pid, lock->l_callback_timeout,
+ lock->l_lvb_type);
+ break;
+
+ case LDLM_FLOCK:
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu\n",
+ ldlm_lock_to_ns_name(lock), lock,
+ lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+ lock->l_readers, lock->l_writers,
+ ldlm_lockname[lock->l_granted_mode],
+ ldlm_lockname[lock->l_req_mode],
+ PLDLMRES(resource),
+ atomic_read(&resource->lr_refcount),
+ ldlm_typename[resource->lr_type],
+ lock->l_policy_data.l_flock.pid,
+ lock->l_policy_data.l_flock.start,
+ lock->l_policy_data.l_flock.end,
+ lock->l_flags, nid, lock->l_remote_handle.cookie,
+ exp ? atomic_read(&exp->exp_refcount) : -99,
+ lock->l_pid, lock->l_callback_timeout);
+ break;
+
+ case LDLM_IBITS:
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+ ldlm_lock_to_ns_name(lock),
+ lock, lock->l_handle.h_cookie,
+ atomic_read(&lock->l_refc),
+ lock->l_readers, lock->l_writers,
+ ldlm_lockname[lock->l_granted_mode],
+ ldlm_lockname[lock->l_req_mode],
+ PLDLMRES(resource),
+ lock->l_policy_data.l_inodebits.bits,
+ atomic_read(&resource->lr_refcount),
+ ldlm_typename[resource->lr_type],
+ lock->l_flags, nid, lock->l_remote_handle.cookie,
+ exp ? atomic_read(&exp->exp_refcount) : -99,
+ lock->l_pid, lock->l_callback_timeout,
+ lock->l_lvb_type);
+ break;
+
+ default:
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+ ldlm_lock_to_ns_name(lock),
+ lock, lock->l_handle.h_cookie,
+ atomic_read(&lock->l_refc),
+ lock->l_readers, lock->l_writers,
+ ldlm_lockname[lock->l_granted_mode],
+ ldlm_lockname[lock->l_req_mode],
+ PLDLMRES(resource),
+ atomic_read(&resource->lr_refcount),
+ ldlm_typename[resource->lr_type],
+ lock->l_flags, nid, lock->l_remote_handle.cookie,
+ exp ? atomic_read(&exp->exp_refcount) : -99,
+ lock->l_pid, lock->l_callback_timeout,
+ lock->l_lvb_type);
+ break;
+ }
+ va_end(args);
+}
+EXPORT_SYMBOL(_ldlm_lock_debug);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
new file mode 100644
index 000000000..08a91f5d9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
@@ -0,0 +1,1191 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lockd.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/lustre_dlm.h"
+#include "../include/obd_class.h"
+#include <linux/list.h>
+#include "ldlm_internal.h"
+
+static int ldlm_num_threads;
+module_param(ldlm_num_threads, int, 0444);
+MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start");
+
+static char *ldlm_cpts;
+module_param(ldlm_cpts, charp, 0444);
+MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on");
+
+static struct mutex ldlm_ref_mutex;
+static int ldlm_refcount;
+
+struct ldlm_cb_async_args {
+ struct ldlm_cb_set_arg *ca_set_arg;
+ struct ldlm_lock *ca_lock;
+};
+
+/* LDLM state */
+
+static struct ldlm_state *ldlm_state;
+
+inline unsigned long round_timeout(unsigned long timeout)
+{
+ return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
+}
+
+/* timeout for initial callback (AST) reply (bz10399) */
+static inline unsigned int ldlm_get_rq_timeout(void)
+{
+ /* Non-AT value */
+ unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+
+ return timeout < 1 ? 1 : timeout;
+}
+
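+/*
+ * Example for ldlm_get_rq_timeout(): with illustrative values of
+ * ldlm_timeout = 20s and obd_timeout = 100s, this yields
+ * min(20, 100 / 3) = 20s; the lower bound keeps the result at least 1s.
+ */
+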
+#define ELT_STOPPED 0
+#define ELT_READY 1
+#define ELT_TERMINATE 2
+
+struct ldlm_bl_pool {
+ spinlock_t blp_lock;
+
+ /*
+ * blp_prio_list is used for callbacks that should be handled
+ * as a priority. It is used for LDLM_FL_DISCARD_DATA requests.
+ * see bug 13843
+ */
+ struct list_head blp_prio_list;
+
+ /*
+ * blp_list is used for all other callbacks which are likely
+ * to take longer to process.
+ */
+ struct list_head blp_list;
+
+ wait_queue_head_t blp_waitq;
+ struct completion blp_comp;
+ atomic_t blp_num_threads;
+ atomic_t blp_busy_threads;
+ int blp_min_threads;
+ int blp_max_threads;
+};
+
+struct ldlm_bl_work_item {
+ struct list_head blwi_entry;
+ struct ldlm_namespace *blwi_ns;
+ struct ldlm_lock_desc blwi_ld;
+ struct ldlm_lock *blwi_lock;
+ struct list_head blwi_head;
+ int blwi_count;
+ struct completion blwi_comp;
+ ldlm_cancel_flags_t blwi_flags;
+ int blwi_mem_pressure;
+};
+
+
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+ return 0;
+}
+
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+{
+ return 0;
+}
+
+
+
+/**
+ * Callback handler for receiving incoming blocking ASTs.
+ *
+ * This can only happen on the client side.
+ */
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+ struct ldlm_lock_desc *ld, struct ldlm_lock *lock)
+{
+ int do_ast;
+
+ LDLM_DEBUG(lock, "client blocking AST callback handler");
+
+ lock_res_and_lock(lock);
+ lock->l_flags |= LDLM_FL_CBPENDING;
+
+ if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)
+ lock->l_flags |= LDLM_FL_CANCEL;
+
+ do_ast = !lock->l_readers && !lock->l_writers;
+ unlock_res_and_lock(lock);
+
+ if (do_ast) {
+ CDEBUG(D_DLMTRACE,
+ "Lock %p already unused, calling callback (%p)\n", lock,
+ lock->l_blocking_ast);
+ if (lock->l_blocking_ast != NULL)
+ lock->l_blocking_ast(lock, ld, lock->l_ast_data,
+ LDLM_CB_BLOCKING);
+ } else {
+ CDEBUG(D_DLMTRACE,
+ "Lock %p is referenced, will be cancelled later\n",
+ lock);
+ }
+
+ LDLM_DEBUG(lock, "client blocking callback handler END");
+ LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Callback handler for receiving incoming completion ASTs.
+ *
+ * This can only happen on the client side.
+ */
+static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+ struct ldlm_namespace *ns,
+ struct ldlm_request *dlm_req,
+ struct ldlm_lock *lock)
+{
+ int lvb_len;
+ LIST_HEAD(ast_list);
+ int rc = 0;
+
+ LDLM_DEBUG(lock, "client completion callback handler START");
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
+ int to = cfs_time_seconds(1);
+
+ while (to > 0) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(to);
+ if (lock->l_granted_mode == lock->l_req_mode ||
+ lock->l_flags & LDLM_FL_DESTROYED)
+ break;
+ }
+ }
+
+ lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT);
+ if (lvb_len < 0) {
+ LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len);
+ rc = lvb_len;
+ goto out;
+ } else if (lvb_len > 0) {
+ if (lock->l_lvb_len > 0) {
+ /* for extent lock, lvb contains ost_lvb{}. */
+ LASSERT(lock->l_lvb_data != NULL);
+
+ if (unlikely(lock->l_lvb_len < lvb_len)) {
+ LDLM_ERROR(lock, "Replied LVB is larger than expectation, expected = %d, replied = %d",
+ lock->l_lvb_len, lvb_len);
+ rc = -EINVAL;
+ goto out;
+ }
+ } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has
+ * variable length */
+ void *lvb_data;
+
+ OBD_ALLOC(lvb_data, lvb_len);
+ if (lvb_data == NULL) {
+ LDLM_ERROR(lock, "No memory: %d.\n", lvb_len);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ lock_res_and_lock(lock);
+ LASSERT(lock->l_lvb_data == NULL);
+ lock->l_lvb_type = LVB_T_LAYOUT;
+ lock->l_lvb_data = lvb_data;
+ lock->l_lvb_len = lvb_len;
+ unlock_res_and_lock(lock);
+ }
+ }
+
+ lock_res_and_lock(lock);
+ if ((lock->l_flags & LDLM_FL_DESTROYED) ||
+ lock->l_granted_mode == lock->l_req_mode) {
+ /* bug 11300: the lock has already been granted */
+ unlock_res_and_lock(lock);
+ LDLM_DEBUG(lock, "Double grant race happened");
+ rc = 0;
+ goto out;
+ }
+
+ /* If we receive the completion AST before the actual enqueue returned,
+ * then we might need to switch lock modes, resources, or extents. */
+ if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
+ lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
+ LDLM_DEBUG(lock, "completion AST, new lock mode");
+ }
+
+ if (lock->l_resource->lr_type != LDLM_PLAIN) {
+ ldlm_convert_policy_to_local(req->rq_export,
+ dlm_req->lock_desc.l_resource.lr_type,
+ &dlm_req->lock_desc.l_policy_data,
+ &lock->l_policy_data);
+ LDLM_DEBUG(lock, "completion AST, new policy data");
+ }
+
+ ldlm_resource_unlink_lock(lock);
+ if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
+ &lock->l_resource->lr_name,
+ sizeof(lock->l_resource->lr_name)) != 0) {
+ unlock_res_and_lock(lock);
+ rc = ldlm_lock_change_resource(ns, lock,
+ &dlm_req->lock_desc.l_resource.lr_name);
+ if (rc < 0) {
+ LDLM_ERROR(lock, "Failed to allocate resource");
+ goto out;
+ }
+ LDLM_DEBUG(lock, "completion AST, new resource");
+ CERROR("change resource!\n");
+ lock_res_and_lock(lock);
+ }
+
+ if (dlm_req->lock_flags & LDLM_FL_AST_SENT) {
+ /* BL_AST locks are not needed in LRU.
+ * Let ldlm_cancel_lru() be fast. */
+ ldlm_lock_remove_from_lru(lock);
+ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+ LDLM_DEBUG(lock, "completion AST includes blocking AST");
+ }
+
+ if (lock->l_lvb_len > 0) {
+ rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT,
+ lock->l_lvb_data, lvb_len);
+ if (rc < 0) {
+ unlock_res_and_lock(lock);
+ goto out;
+ }
+ }
+
+ ldlm_grant_lock(lock, &ast_list);
+ unlock_res_and_lock(lock);
+
+ LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
+
+ /* Let the enqueue path call osc_lock_upcall() and initialize
+ * l_ast_data. */
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2);
+
+ ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);
+
+ LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)",
+ lock);
+ goto out;
+
+out:
+ if (rc < 0) {
+ lock_res_and_lock(lock);
+ lock->l_flags |= LDLM_FL_FAILED;
+ unlock_res_and_lock(lock);
+ wake_up(&lock->l_waitq);
+ }
+ LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Callback handler for receiving incoming glimpse ASTs.
+ *
+ * This can only happen on the client side. After handling the glimpse AST
+ * we also consider dropping the lock here if it is unused locally for a
+ * long time.
+ */
+static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
+ struct ldlm_namespace *ns,
+ struct ldlm_request *dlm_req,
+ struct ldlm_lock *lock)
+{
+ int rc = -ENOSYS;
+
+ LDLM_DEBUG(lock, "client glimpse AST callback handler");
+
+ if (lock->l_glimpse_ast != NULL)
+ rc = lock->l_glimpse_ast(lock, req);
+
+ if (req->rq_repmsg != NULL) {
+ ptlrpc_reply(req);
+ } else {
+ req->rq_status = rc;
+ ptlrpc_error(req);
+ }
+
+ lock_res_and_lock(lock);
+ if (lock->l_granted_mode == LCK_PW &&
+ !lock->l_readers && !lock->l_writers &&
+ cfs_time_after(cfs_time_current(),
+ cfs_time_add(lock->l_last_used,
+ cfs_time_seconds(10)))) {
+ unlock_res_and_lock(lock);
+ if (ldlm_bl_to_thread_lock(ns, NULL, lock))
+ ldlm_handle_bl_callback(ns, NULL, lock);
+
+ return;
+ }
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_RELEASE(lock);
+}
+
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{
+ if (req->rq_no_reply)
+ return 0;
+
+ req->rq_status = rc;
+ if (!req->rq_packed_final) {
+ rc = lustre_pack_reply(req, 1, NULL, NULL);
+ if (rc)
+ return rc;
+ }
+ return ptlrpc_reply(req);
+}
+
+static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
+ ldlm_cancel_flags_t cancel_flags)
+{
+ struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+
+ spin_lock(&blp->blp_lock);
+ if (blwi->blwi_lock &&
+ blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) {
+ /* add LDLM_FL_DISCARD_DATA requests to the priority list */
+ list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list);
+ } else {
+ /* other blocking callbacks are added to the regular list */
+ list_add_tail(&blwi->blwi_entry, &blp->blp_list);
+ }
+ spin_unlock(&blp->blp_lock);
+
+ wake_up(&blp->blp_waitq);
+
+ /* cannot check blwi->blwi_flags as blwi may already have been freed
+ in LCF_ASYNC mode */
+ if (!(cancel_flags & LCF_ASYNC))
+ wait_for_completion(&blwi->blwi_comp);
+
+ return 0;
+}
+
+static inline void init_blwi(struct ldlm_bl_work_item *blwi,
+ struct ldlm_namespace *ns,
+ struct ldlm_lock_desc *ld,
+ struct list_head *cancels, int count,
+ struct ldlm_lock *lock,
+ ldlm_cancel_flags_t cancel_flags)
+{
+ init_completion(&blwi->blwi_comp);
+ INIT_LIST_HEAD(&blwi->blwi_head);
+
+ if (memory_pressure_get())
+ blwi->blwi_mem_pressure = 1;
+
+ blwi->blwi_ns = ns;
+ blwi->blwi_flags = cancel_flags;
+ if (ld != NULL)
+ blwi->blwi_ld = *ld;
+ if (count) {
+ list_add(&blwi->blwi_head, cancels);
+ list_del_init(cancels);
+ blwi->blwi_count = count;
+ } else {
+ blwi->blwi_lock = lock;
+ }
+}
+
+/**
+ * Queues a list of locks \a cancels containing \a count locks
+ * for later processing by a blocking thread. If \a count is zero,
+ * then the lock referenced as \a lock is queued instead.
+ *
+ * The blocking thread then calls the lock's ->l_blocking_ast callback.
+ * If list addition fails, an error is returned and the caller is expected
+ * to call ->l_blocking_ast itself.
+ */
+static int ldlm_bl_to_thread(struct ldlm_namespace *ns,
+ struct ldlm_lock_desc *ld,
+ struct ldlm_lock *lock,
+ struct list_head *cancels, int count,
+ ldlm_cancel_flags_t cancel_flags)
+{
+ if (cancels && count == 0)
+ return 0;
+
+ if (cancel_flags & LCF_ASYNC) {
+ struct ldlm_bl_work_item *blwi;
+
+ OBD_ALLOC(blwi, sizeof(*blwi));
+ if (blwi == NULL)
+ return -ENOMEM;
+ init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags);
+
+ return __ldlm_bl_to_thread(blwi, cancel_flags);
+ } else {
+ /* for a synchronous call, do minimal memory allocation, as it
+ * could be triggered from the kernel shrinker
+ */
+ struct ldlm_bl_work_item blwi;
+
+ memset(&blwi, 0, sizeof(blwi));
+ init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags);
+ return __ldlm_bl_to_thread(&blwi, cancel_flags);
+ }
+}
+
+
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+ struct ldlm_lock *lock)
+{
+ return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC);
+}
+
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+ struct list_head *cancels, int count,
+ ldlm_cancel_flags_t cancel_flags)
+{
+ return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags);
+}
+
+/* Setinfo coming from the server (e.g. MDT) to the client (e.g. MDC)! */
+static int ldlm_handle_setinfo(struct ptlrpc_request *req)
+{
+ struct obd_device *obd = req->rq_export->exp_obd;
+ char *key;
+ void *val;
+ int keylen, vallen;
+ int rc = -ENOSYS;
+
+ DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name);
+
+ req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO);
+
+ key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ if (key == NULL) {
+ DEBUG_REQ(D_IOCTL, req, "no set_info key");
+ return -EFAULT;
+ }
+ keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT);
+ val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+ if (val == NULL) {
+ DEBUG_REQ(D_IOCTL, req, "no set_info val");
+ return -EFAULT;
+ }
+ vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL,
+ RCL_CLIENT);
+
+ /* We are responsible for swabbing contents of val */
+
+ if (KEY_IS(KEY_HSM_COPYTOOL_SEND))
+ /* Pass it on to mdc (the "export" in this case) */
+ rc = obd_set_info_async(req->rq_svc_thread->t_env,
+ req->rq_export,
+ sizeof(KEY_HSM_COPYTOOL_SEND),
+ KEY_HSM_COPYTOOL_SEND,
+ vallen, val, NULL);
+ else
+ DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key);
+
+ return rc;
+}
+
+static inline void ldlm_callback_errmsg(struct ptlrpc_request *req,
+ const char *msg, int rc,
+ struct lustre_handle *handle)
+{
+ DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req,
+ "%s: [nid %s] [rc %d] [lock %#llx]",
+ msg, libcfs_id2str(req->rq_peer), rc,
+ handle ? handle->cookie : 0);
+ if (req->rq_no_reply)
+ CWARN("No reply was sent, maybe cause bug 21636.\n");
+ else if (rc)
+ CWARN("Send reply failed, maybe cause bug 21636.\n");
+}
+
+static int ldlm_handle_qc_callback(struct ptlrpc_request *req)
+{
+ struct obd_quotactl *oqctl;
+ struct client_obd *cli = &req->rq_export->exp_obd->u.cli;
+
+ oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ if (oqctl == NULL) {
+ CERROR("Can't unpack obd_quotactl\n");
+ return -EPROTO;
+ }
+
+ oqctl->qc_stat = ptlrpc_status_ntoh(oqctl->qc_stat);
+
+ cli->cl_qchk_stat = oqctl->qc_stat;
+ return 0;
+}
+
+/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */
+static int ldlm_callback_handler(struct ptlrpc_request *req)
+{
+ struct ldlm_namespace *ns;
+ struct ldlm_request *dlm_req;
+ struct ldlm_lock *lock;
+ int rc;
+
+ /* Requests arrive in sender's byte order. The ptlrpc service
+ * handler has already checked and, if necessary, byte-swapped the
+ * incoming request message body, but I am responsible for the
+ * message buffers. */
+
+ /* do nothing for sec context finalize */
+ if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI)
+ return 0;
+
+ req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+
+ if (req->rq_export == NULL) {
+ rc = ldlm_callback_reply(req, -ENOTCONN);
+ ldlm_callback_errmsg(req, "Operate on unconnected server",
+ rc, NULL);
+ return 0;
+ }
+
+ LASSERT(req->rq_export != NULL);
+ LASSERT(req->rq_export->exp_obd != NULL);
+
+ switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+ case LDLM_BL_CALLBACK:
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+ return 0;
+ break;
+ case LDLM_CP_CALLBACK:
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
+ return 0;
+ break;
+ case LDLM_GL_CALLBACK:
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET))
+ return 0;
+ break;
+ case LDLM_SET_INFO:
+ rc = ldlm_handle_setinfo(req);
+ ldlm_callback_reply(req, rc);
+ return 0;
+ case OBD_QC_CALLBACK:
+ req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK);
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET))
+ return 0;
+ rc = ldlm_handle_qc_callback(req);
+ ldlm_callback_reply(req, rc);
+ return 0;
+ default:
+ CERROR("unknown opcode %u\n",
+ lustre_msg_get_opc(req->rq_reqmsg));
+ ldlm_callback_reply(req, -EPROTO);
+ return 0;
+ }
+
+ ns = req->rq_export->exp_obd->obd_namespace;
+ LASSERT(ns != NULL);
+
+ req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK);
+
+ dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ if (dlm_req == NULL) {
+ rc = ldlm_callback_reply(req, -EPROTO);
+ ldlm_callback_errmsg(req, "Operate without parameter", rc,
+ NULL);
+ return 0;
+ }
+
+ /* Force a known safe race, send a cancel to the server for a lock
+ * which the server has already started a blocking callback on. */
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) &&
+ lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+ rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0);
+ if (rc < 0)
+ CERROR("ldlm_cli_cancel: %d\n", rc);
+ }
+
+ lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0);
+ if (!lock) {
+ CDEBUG(D_DLMTRACE, "callback on lock %#llx - lock disappeared\n",
+ dlm_req->lock_handle[0].cookie);
+ rc = ldlm_callback_reply(req, -EINVAL);
+ ldlm_callback_errmsg(req, "Operate with invalid parameter", rc,
+ &dlm_req->lock_handle[0]);
+ return 0;
+ }
+
+ if ((lock->l_flags & LDLM_FL_FAIL_LOC) &&
+ lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
+ OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+ /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
+ lock_res_and_lock(lock);
+ lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
+ LDLM_AST_FLAGS);
+ if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+ /* If somebody cancels the lock and the cache is already dropped,
+ * or the lock failed before the cp_ast was received on the client,
+ * we can tell the server we have no lock. Otherwise, we should
+ * send the cancel after dropping the cache. */
+ if (((lock->l_flags & LDLM_FL_CANCELING) &&
+ (lock->l_flags & LDLM_FL_BL_DONE)) ||
+ (lock->l_flags & LDLM_FL_FAILED)) {
+ LDLM_DEBUG(lock, "callback on lock %#llx - lock disappeared\n",
+ dlm_req->lock_handle[0].cookie);
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_RELEASE(lock);
+ rc = ldlm_callback_reply(req, -EINVAL);
+ ldlm_callback_errmsg(req, "Operate on stale lock", rc,
+ &dlm_req->lock_handle[0]);
+ return 0;
+ }
+ /* BL_AST locks are not needed in LRU.
+ * Let ldlm_cancel_lru() be fast. */
+ ldlm_lock_remove_from_lru(lock);
+ lock->l_flags |= LDLM_FL_BL_AST;
+ }
+ unlock_res_and_lock(lock);
+
+ /* We want the ost thread to get this reply so that it can respond
+ * to ost requests (write cache writeback) that might be triggered
+ * in the callback.
+ *
+ * But we'd also like to be able to indicate in the reply that we're
+ * cancelling right now, because it's unused, or have an intent result
+ * in the reply, so we might have to push the responsibility for sending
+ * the reply down into the AST handlers, alas. */
+
+ switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+ case LDLM_BL_CALLBACK:
+ CDEBUG(D_INODE, "blocking ast\n");
+ req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK);
+ if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
+ rc = ldlm_callback_reply(req, 0);
+ if (req->rq_no_reply || rc)
+ ldlm_callback_errmsg(req, "Normal process", rc,
+ &dlm_req->lock_handle[0]);
+ }
+ if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
+ ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
+ break;
+ case LDLM_CP_CALLBACK:
+ CDEBUG(D_INODE, "completion ast\n");
+ req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+ ldlm_callback_reply(req, 0);
+ ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+ break;
+ case LDLM_GL_CALLBACK:
+ CDEBUG(D_INODE, "glimpse ast\n");
+ req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
+ ldlm_handle_gl_callback(req, ns, dlm_req, lock);
+ break;
+ default:
+ LBUG(); /* checked above */
+ }
+
+ return 0;
+}
+
+
+static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp)
+{
+ struct ldlm_bl_work_item *blwi = NULL;
+ static unsigned int num_bl;
+
+ spin_lock(&blp->blp_lock);
+ /* process a request from blp_list at least once every blp_num_threads
+ * iterations, even if blp_prio_list is not empty */
+ if (!list_empty(&blp->blp_list) &&
+ (list_empty(&blp->blp_prio_list) || num_bl == 0))
+ blwi = list_entry(blp->blp_list.next,
+ struct ldlm_bl_work_item, blwi_entry);
+ else if (!list_empty(&blp->blp_prio_list))
+ blwi = list_entry(blp->blp_prio_list.next,
+ struct ldlm_bl_work_item, blwi_entry);
+
+ if (blwi) {
+ if (++num_bl >= atomic_read(&blp->blp_num_threads))
+ num_bl = 0;
+ list_del(&blwi->blwi_entry);
+ }
+ spin_unlock(&blp->blp_lock);
+
+ return blwi;
+}
+
+/* This only contains temporary data until the thread starts */
+struct ldlm_bl_thread_data {
+ char bltd_name[CFS_CURPROC_COMM_MAX];
+ struct ldlm_bl_pool *bltd_blp;
+ struct completion bltd_comp;
+ int bltd_num;
+};
+
+static int ldlm_bl_thread_main(void *arg);
+
+static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
+{
+ struct ldlm_bl_thread_data bltd = { .bltd_blp = blp };
+ struct task_struct *task;
+
+ init_completion(&bltd.bltd_comp);
+ bltd.bltd_num = atomic_read(&blp->blp_num_threads);
+ snprintf(bltd.bltd_name, sizeof(bltd.bltd_name),
+ "ldlm_bl_%02d", bltd.bltd_num);
+ task = kthread_run(ldlm_bl_thread_main, &bltd, "%s", bltd.bltd_name);
+ if (IS_ERR(task)) {
+ CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n",
+ atomic_read(&blp->blp_num_threads), PTR_ERR(task));
+ return PTR_ERR(task);
+ }
+ wait_for_completion(&bltd.bltd_comp);
+
+ return 0;
+}
+
+/**
+ * Main blocking requests processing thread.
+ *
+ * Callers put locks into its queue by calling ldlm_bl_to_thread.
+ * This thread in the end ends up doing actual call to ->l_blocking_ast
+ * for queued locks.
+ */
+static int ldlm_bl_thread_main(void *arg)
+{
+ struct ldlm_bl_pool *blp;
+
+ {
+ struct ldlm_bl_thread_data *bltd = arg;
+
+ blp = bltd->bltd_blp;
+
+ atomic_inc(&blp->blp_num_threads);
+ atomic_inc(&blp->blp_busy_threads);
+
+ complete(&bltd->bltd_comp);
+ /* cannot use bltd after this, it is only on caller's stack */
+ }
+
+ while (1) {
+ struct l_wait_info lwi = { 0 };
+ struct ldlm_bl_work_item *blwi = NULL;
+ int busy;
+
+ blwi = ldlm_bl_get_work(blp);
+
+ if (blwi == NULL) {
+ atomic_dec(&blp->blp_busy_threads);
+ l_wait_event_exclusive(blp->blp_waitq,
+ (blwi = ldlm_bl_get_work(blp)) != NULL,
+ &lwi);
+ busy = atomic_inc_return(&blp->blp_busy_threads);
+ } else {
+ busy = atomic_read(&blp->blp_busy_threads);
+ }
+
+ if (blwi->blwi_ns == NULL)
+ /* added by ldlm_cleanup() */
+ break;
+
+ /* Not fatal if this races and we end up with a few too many threads */
+ if (unlikely(busy < blp->blp_max_threads &&
+ busy >= atomic_read(&blp->blp_num_threads) &&
+ !blwi->blwi_mem_pressure))
+ /* discard the return value, we tried */
+ ldlm_bl_thread_start(blp);
+
+ if (blwi->blwi_mem_pressure)
+ memory_pressure_set();
+
+ if (blwi->blwi_count) {
+ int count;
+ /* In the special case where we cancel LRU locks
+ * asynchronously, the list of locks is passed here.
+ * These locks are marked LDLM_FL_CANCELING, but are NOT
+ * cancelled locally yet. */
+ count = ldlm_cli_cancel_list_local(&blwi->blwi_head,
+ blwi->blwi_count,
+ LCF_BL_AST);
+ ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL,
+ blwi->blwi_flags);
+ } else {
+ ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld,
+ blwi->blwi_lock);
+ }
+ if (blwi->blwi_mem_pressure)
+ memory_pressure_clr();
+
+ if (blwi->blwi_flags & LCF_ASYNC)
+ OBD_FREE(blwi, sizeof(*blwi));
+ else
+ complete(&blwi->blwi_comp);
+ }
+
+ atomic_dec(&blp->blp_busy_threads);
+ atomic_dec(&blp->blp_num_threads);
+ complete(&blp->blp_comp);
+ return 0;
+}
+
+
+static int ldlm_setup(void);
+static int ldlm_cleanup(void);
+
+int ldlm_get_ref(void)
+{
+ int rc = 0;
+
+ mutex_lock(&ldlm_ref_mutex);
+ if (++ldlm_refcount == 1) {
+ rc = ldlm_setup();
+ if (rc)
+ ldlm_refcount--;
+ }
+ mutex_unlock(&ldlm_ref_mutex);
+
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_get_ref);
+
+void ldlm_put_ref(void)
+{
+ mutex_lock(&ldlm_ref_mutex);
+ if (ldlm_refcount == 1) {
+ int rc = ldlm_cleanup();
+
+ if (rc)
+ CERROR("ldlm_cleanup failed: %d\n", rc);
+ else
+ ldlm_refcount--;
+ } else {
+ ldlm_refcount--;
+ }
+ mutex_unlock(&ldlm_ref_mutex);
+}
+EXPORT_SYMBOL(ldlm_put_ref);
+
+/*
+ * Export handle<->lock hash operations.
+ */
+static unsigned
+ldlm_export_lock_hash(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask);
+}
+
+static void *
+ldlm_export_lock_key(struct hlist_node *hnode)
+{
+ struct ldlm_lock *lock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+ return &lock->l_remote_handle;
+}
+
+static void
+ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key)
+{
+ struct ldlm_lock *lock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+ lock->l_remote_handle = *(struct lustre_handle *)key;
+}
+
+static int
+ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode)
+{
+ return lustre_handle_equal(ldlm_export_lock_key(hnode), key);
+}
+
+static void *
+ldlm_export_lock_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+}
+
+static void
+ldlm_export_lock_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ldlm_lock *lock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+ LDLM_LOCK_GET(lock);
+}
+
+static void
+ldlm_export_lock_put(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ldlm_lock *lock;
+
+ lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+ LDLM_LOCK_RELEASE(lock);
+}
+
+static cfs_hash_ops_t ldlm_export_lock_ops = {
+ .hs_hash = ldlm_export_lock_hash,
+ .hs_key = ldlm_export_lock_key,
+ .hs_keycmp = ldlm_export_lock_keycmp,
+ .hs_keycpy = ldlm_export_lock_keycpy,
+ .hs_object = ldlm_export_lock_object,
+ .hs_get = ldlm_export_lock_get,
+ .hs_put = ldlm_export_lock_put,
+ .hs_put_locked = ldlm_export_lock_put,
+};
+
+int ldlm_init_export(struct obd_export *exp)
+{
+ int rc;
+
+ exp->exp_lock_hash =
+ cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+ HASH_EXP_LOCK_CUR_BITS,
+ HASH_EXP_LOCK_MAX_BITS,
+ HASH_EXP_LOCK_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ &ldlm_export_lock_ops,
+ CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY |
+ CFS_HASH_NBLK_CHANGE);
+
+ if (!exp->exp_lock_hash)
+ return -ENOMEM;
+
+ rc = ldlm_init_flock_export(exp);
+ if (rc)
+ goto err;
+
+ return 0;
+err:
+ ldlm_destroy_export(exp);
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_init_export);
+
+void ldlm_destroy_export(struct obd_export *exp)
+{
+ cfs_hash_putref(exp->exp_lock_hash);
+ exp->exp_lock_hash = NULL;
+
+ ldlm_destroy_flock_export(exp);
+}
+EXPORT_SYMBOL(ldlm_destroy_export);
+
+static int ldlm_setup(void)
+{
+ static struct ptlrpc_service_conf conf;
+ struct ldlm_bl_pool *blp = NULL;
+ int rc = 0;
+ int i;
+
+ if (ldlm_state != NULL)
+ return -EALREADY;
+
+ OBD_ALLOC(ldlm_state, sizeof(*ldlm_state));
+ if (ldlm_state == NULL)
+ return -ENOMEM;
+
+ rc = ldlm_proc_setup();
+ if (rc != 0)
+ goto out;
+
+ memset(&conf, 0, sizeof(conf));
+ conf = (typeof(conf)) {
+ .psc_name = "ldlm_cbd",
+ .psc_watchdog_factor = 2,
+ .psc_buf = {
+ .bc_nbufs = LDLM_CLIENT_NBUFS,
+ .bc_buf_size = LDLM_BUFSIZE,
+ .bc_req_max_size = LDLM_MAXREQSIZE,
+ .bc_rep_max_size = LDLM_MAXREPSIZE,
+ .bc_req_portal = LDLM_CB_REQUEST_PORTAL,
+ .bc_rep_portal = LDLM_CB_REPLY_PORTAL,
+ },
+ .psc_thr = {
+ .tc_thr_name = "ldlm_cb",
+ .tc_thr_factor = LDLM_THR_FACTOR,
+ .tc_nthrs_init = LDLM_NTHRS_INIT,
+ .tc_nthrs_base = LDLM_NTHRS_BASE,
+ .tc_nthrs_max = LDLM_NTHRS_MAX,
+ .tc_nthrs_user = ldlm_num_threads,
+ .tc_cpu_affinity = 1,
+ .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD,
+ },
+ .psc_cpt = {
+ .cc_pattern = ldlm_cpts,
+ },
+ .psc_ops = {
+ .so_req_handler = ldlm_callback_handler,
+ },
+ };
+ ldlm_state->ldlm_cb_service =
+ ptlrpc_register_service(&conf, ldlm_svc_proc_dir);
+ if (IS_ERR(ldlm_state->ldlm_cb_service)) {
+ CERROR("failed to start service\n");
+ rc = PTR_ERR(ldlm_state->ldlm_cb_service);
+ ldlm_state->ldlm_cb_service = NULL;
+ goto out;
+ }
+
+
+ OBD_ALLOC(blp, sizeof(*blp));
+ if (blp == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ldlm_state->ldlm_bl_pool = blp;
+
+ spin_lock_init(&blp->blp_lock);
+ INIT_LIST_HEAD(&blp->blp_list);
+ INIT_LIST_HEAD(&blp->blp_prio_list);
+ init_waitqueue_head(&blp->blp_waitq);
+ atomic_set(&blp->blp_num_threads, 0);
+ atomic_set(&blp->blp_busy_threads, 0);
+
+ if (ldlm_num_threads == 0) {
+ blp->blp_min_threads = LDLM_NTHRS_INIT;
+ blp->blp_max_threads = LDLM_NTHRS_MAX;
+ } else {
+ blp->blp_min_threads = blp->blp_max_threads =
+ min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT,
+ ldlm_num_threads));
+ }
+
+ for (i = 0; i < blp->blp_min_threads; i++) {
+ rc = ldlm_bl_thread_start(blp);
+ if (rc < 0)
+ goto out;
+ }
+
+
+ rc = ldlm_pools_init();
+ if (rc) {
+ CERROR("Failed to initialize LDLM pools: %d\n", rc);
+ goto out;
+ }
+ return 0;
+
+ out:
+ ldlm_cleanup();
+ return rc;
+}
+
+static int ldlm_cleanup(void)
+{
+ if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) ||
+ !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) {
+ CERROR("ldlm still has namespaces; clean these up first.\n");
+ ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+ ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+ return -EBUSY;
+ }
+
+ ldlm_pools_fini();
+
+ if (ldlm_state->ldlm_bl_pool != NULL) {
+ struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+
+ while (atomic_read(&blp->blp_num_threads) > 0) {
+ struct ldlm_bl_work_item blwi = { .blwi_ns = NULL };
+
+ init_completion(&blp->blp_comp);
+
+ spin_lock(&blp->blp_lock);
+ list_add_tail(&blwi.blwi_entry, &blp->blp_list);
+ wake_up(&blp->blp_waitq);
+ spin_unlock(&blp->blp_lock);
+
+ wait_for_completion(&blp->blp_comp);
+ }
+
+ OBD_FREE(blp, sizeof(*blp));
+ }
+
+ if (ldlm_state->ldlm_cb_service != NULL)
+ ptlrpc_unregister_service(ldlm_state->ldlm_cb_service);
+
+ ldlm_proc_cleanup();
+
+
+ OBD_FREE(ldlm_state, sizeof(*ldlm_state));
+ ldlm_state = NULL;
+
+ return 0;
+}
+
+int ldlm_init(void)
+{
+ mutex_init(&ldlm_ref_mutex);
+ mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER));
+ mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+ ldlm_resource_slab = kmem_cache_create("ldlm_resources",
+ sizeof(struct ldlm_resource), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (ldlm_resource_slab == NULL)
+ return -ENOMEM;
+
+ ldlm_lock_slab = kmem_cache_create("ldlm_locks",
+ sizeof(struct ldlm_lock), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL);
+ if (ldlm_lock_slab == NULL) {
+ kmem_cache_destroy(ldlm_resource_slab);
+ return -ENOMEM;
+ }
+
+ ldlm_interval_slab = kmem_cache_create("interval_node",
+ sizeof(struct ldlm_interval),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (ldlm_interval_slab == NULL) {
+ kmem_cache_destroy(ldlm_resource_slab);
+ kmem_cache_destroy(ldlm_lock_slab);
+ return -ENOMEM;
+ }
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ class_export_dump_hook = ldlm_dump_export_locks;
+#endif
+ return 0;
+}
+
+void ldlm_exit(void)
+{
+ if (ldlm_refcount)
+ CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
+ kmem_cache_destroy(ldlm_resource_slab);
+ /* ldlm_lock_put() uses RCU to call ldlm_lock_free(), so call
+ * synchronize_rcu() to wait for a grace period to elapse and give
+ * ldlm_lock_free() a chance to be called. */
+ synchronize_rcu();
+ kmem_cache_destroy(ldlm_lock_slab);
+ kmem_cache_destroy(ldlm_interval_slab);
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
new file mode 100644
index 000000000..a1fe2c161
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_plain.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of PLAIN lock type.
+ *
+ * PLAIN locks are the simplest form of LDLM locking, and are used when
+ * there only needs to be a single lock on a resource. This avoids some
+ * of the complexity of EXTENT and IBITS lock types, but doesn't allow
+ * different "parts" of a resource to be locked concurrently. Example
+ * use cases for PLAIN locks include locking of MGS configuration logs
+ * and (as of Lustre 2.4) quota records.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../include/lustre_dlm.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_lib.h"
+
+#include "ldlm_internal.h"
+
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+ ldlm_policy_data_t *lpolicy)
+{
+ /* No policy for plain locks */
+}
+
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+ ldlm_wire_policy_data_t *wpolicy)
+{
+ /* No policy for plain locks */
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
new file mode 100644
index 000000000..a9f4833e0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
@@ -0,0 +1,1455 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_pool.c
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+/*
+ * The idea of this code is rather simple. Each second, for each server
+ * namespace, we compute the SLV - server lock volume - from the current
+ * number of granted locks, the grant speed of the past period, etc - that
+ * is, from the locking load. For simplicity the SLV may be thought of as a
+ * flow definition. It is sent to clients at every opportunity to let them
+ * know the current load situation on the server. Initially the server SLV
+ * is set to a maximum value computed as follows: allow one client to hold
+ * all ->pl_limit locks for 10h.
+ *
+ * Next, on clients, the number of cached locks is no longer limited
+ * artificially as it was before. Instead, the client calculates a CLV -
+ * client lock volume - for each lock and compares it with the last SLV
+ * received from the server. The CLV is calculated as the number of locks
+ * in the LRU * the lock's live time in seconds. If CLV > SLV, the lock is
+ * cancelled.
+ *
+ * The client also has an LVF - lock volume factor - which regulates how
+ * sensitive the client is to the last SLV from the server. The higher the
+ * LVF, the more locks are cancelled on the client. Its default value is 1;
+ * setting the LVF to 2 means the client cancels locks twice as fast.
+ *
+ * Locks on a client are cancelled more aggressively in these cases:
+ * (1) the SLV is smaller, that is, the load on the server is higher;
+ * (2) the client holds a lot of locks (the more locks a client holds, the
+ * higher the chance that some of them should be cancelled);
+ * (3) the client has old locks (taken some time ago).
+ *
+ * Thus, in the flow paradigm used to make the SLV easier to understand, the
+ * CLV is the volume of a particle in the flow described by the SLV. If the
+ * flow gets thinner, more and more particles end up outside of it, and since
+ * particles are locks, they should be cancelled.
+ *
+ * The general idea belongs to Vitaly Fertman (vitaly@clusterfs.com).
+ * Andreas Dilger (adilger@clusterfs.com) proposed a few nice ideas, such as
+ * using the LVF, and many cleanups. The flow definition that makes the logic
+ * easier to understand belongs to Nikita Danilov (nikita@clusterfs.com), as
+ * do many cleanups and fixes. The design and implementation are by Yury
+ * Umanets (umka@clusterfs.com).
+ *
+ * Glossary for terms used:
+ *
+ * pl_limit - Number of allowed locks in pool. Applies to server and client
+ * side (tunable);
+ *
+ * pl_granted - Number of granted locks (calculated);
+ * pl_grant_rate - Number of granted locks for last T (calculated);
+ * pl_cancel_rate - Number of canceled locks for last T (calculated);
+ * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
+ * pl_grant_plan - Planned number of granted locks for next T (calculated);
+ * pl_server_lock_volume - Current server lock volume (calculated);
+ *
+ * As can be seen from the list above, we have a few tunables which can
+ * affect behavior considerably. They can all be modified via proc. They also
+ * make it possible to construct several pre-defined behavior policies. If
+ * none of the predefined policies suits the working pattern in use, a new
+ * one may be "constructed" via the proc tunables.
+ */
+
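+/*
+ * Worked example of the scheme above (numbers purely illustrative): with
+ * 1000 locks in the client LRU, a lock that has been unused for 60s and
+ * LVF = 1 gives CLV ~ 1000 * 60 = 60000. If the last SLV received from the
+ * server is below that, the lock is cancelled; setting LVF = 2 doubles the
+ * CLV and therefore makes cancellation twice as aggressive.
+ */
+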
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../include/lustre_dlm.h"
+#include "../include/cl_object.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "ldlm_internal.h"
+
+
+/*
+ * 50 ldlm locks for 1MB of RAM.
+ */
+#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50)
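+/*
+ * E.g. assuming NUM_CACHEPAGES covers roughly 4 GB of 4 KB pages
+ * (PAGE_CACHE_SHIFT == 12), this works out to 4096 * 50 = 204800 locks.
+ */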
+
+/*
+ * Maximal possible grant step plan in %.
+ */
+#define LDLM_POOL_MAX_GSP (30)
+
+/*
+ * Minimal possible grant step plan in %.
+ */
+#define LDLM_POOL_MIN_GSP (1)
+
+/*
+ * This controls the speed of reaching LDLM_POOL_MAX_GSP
+ * with increasing thread period.
+ */
+#define LDLM_POOL_GSP_STEP_SHIFT (2)
+
+/*
+ * LDLM_POOL_GSP% of all locks is default GP.
+ */
+#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100)
+
+/*
+ * Max age for locks on clients.
+ */
+#define LDLM_POOL_MAX_AGE (36000)
+
+/*
+ * The granularity of SLV calculation.
+ */
+#define LDLM_POOL_SLV_SHIFT (10)
+
+static inline __u64 dru(__u64 val, __u32 shift, int round_up)
+{
+ return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift;
+}
+
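+/*
+ * dru() is "divide and round up" by 2^shift: e.g. with
+ * shift == LDLM_POOL_SLV_SHIFT (10), dru(1000, 10, 1) == 1 while
+ * dru(1000, 10, 0) == 0 (plain truncation).
+ */
+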
+static inline __u64 ldlm_pool_slv_max(__u32 L)
+{
+ /*
+ * Allow one client to hold all locks for 10 hrs.
+ * The formula is: limit * 10h / 1 client.
+ */
+ __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1;
+ return lim;
+}
+
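+/*
+ * E.g. a limit of 50000 locks gives an initial (maximum) SLV of
+ * 50000 * LDLM_POOL_MAX_AGE = 50000 * 36000 = 1800000000.
+ */
+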
+static inline __u64 ldlm_pool_slv_min(__u32 L)
+{
+ return 1;
+}
+
+enum {
+ LDLM_POOL_FIRST_STAT = 0,
+ LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT,
+ LDLM_POOL_GRANT_STAT,
+ LDLM_POOL_CANCEL_STAT,
+ LDLM_POOL_GRANT_RATE_STAT,
+ LDLM_POOL_CANCEL_RATE_STAT,
+ LDLM_POOL_GRANT_PLAN_STAT,
+ LDLM_POOL_SLV_STAT,
+ LDLM_POOL_SHRINK_REQTD_STAT,
+ LDLM_POOL_SHRINK_FREED_STAT,
+ LDLM_POOL_RECALC_STAT,
+ LDLM_POOL_TIMING_STAT,
+ LDLM_POOL_LAST_STAT
+};
+
+static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
+{
+ return container_of(pl, struct ldlm_namespace, ns_pool);
+}
+
+/**
+ * Calculates the suggested grant_step as a % of available locks for the
+ * passed period \a t. This is later used in grant_plan calculations.
+ */
+static inline int ldlm_pool_t2gsp(unsigned int t)
+{
+ /*
+ * This yields a 1% grant step for any period shorter than
+ * (1 << LDLM_POOL_GSP_STEP_SHIFT) seconds and up to 30% for longer periods.
+ *
+ * How this affects execution:
+ *
+ * - for a 1s thread period the grant_step is 1%, which is good for taking
+ * some load off the server and pushing it out to clients. A 1% grant_step
+ * means the server does not allow clients to grab lots of locks in a short
+ * period of time and keep all their old locks cached; clients always have
+ * to give some locks back if they want to take new ones;
+ *
+ * - for a 10s thread period (the default) the grant_step is 23%, which
+ * gives clients enough room to take some new locks without giving any
+ * back. All locks from this 23% which were not taken by clients in the
+ * current period contribute to SLV growth, and SLV growth means more locks
+ * cached on clients until the limit or grant plan is reached.
+ */
+ return LDLM_POOL_MAX_GSP -
+ ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >>
+ (t >> LDLM_POOL_GSP_STEP_SHIFT));
+}
+
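+/*
+ * E.g. for a 1s recalc period: 1 >> 2 == 0, so the step is
+ * 30 - (29 >> 0) = 1%; for the default 10s period: 10 >> 2 == 2, so the
+ * step is 30 - (29 >> 2) = 23%.
+ */
+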
+/**
+ * Recalculates next grant limit on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
+{
+ int granted, grant_step, limit;
+
+ limit = ldlm_pool_get_limit(pl);
+ granted = atomic_read(&pl->pl_granted);
+
+ grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+ grant_step = ((limit - granted) * grant_step) / 100;
+ pl->pl_grant_plan = granted + grant_step;
+ limit = (limit * 5) >> 2;
+ if (pl->pl_grant_plan > limit)
+ pl->pl_grant_plan = limit;
+}
+
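+/*
+ * E.g. limit = 10000, granted = 8000 and a 10s period (grant_step = 23%):
+ * grant_step = (10000 - 8000) * 23 / 100 = 460, so grant_plan = 8460,
+ * well under the (10000 * 5) >> 2 = 12500 cap.
+ */
+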
+/**
+ * Recalculates next SLV on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
+{
+ int granted;
+ int grant_plan;
+ int round_up;
+ __u64 slv;
+ __u64 slv_factor;
+ __u64 grant_usage;
+ __u32 limit;
+
+ slv = pl->pl_server_lock_volume;
+ grant_plan = pl->pl_grant_plan;
+ limit = ldlm_pool_get_limit(pl);
+ granted = atomic_read(&pl->pl_granted);
+ round_up = granted < limit;
+
+ grant_usage = max_t(int, limit - (granted - grant_plan), 1);
+
+ /*
+ * Find the SLV change factor, which is the ratio of grant usage
+ * to the limit. The SLV changes as fast as the ratio of grant
+ * plan consumption: the more locks from the grant plan are left
+ * unconsumed by clients in the last interval (idle time), the
+ * faster the SLV grows; conversely, the more the grant plan is
+ * over-consumed (load time), the faster the SLV drops.
+ */
+ slv_factor = grant_usage << LDLM_POOL_SLV_SHIFT;
+ do_div(slv_factor, limit);
+ slv = slv * slv_factor;
+ slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up);
+
+ if (slv > ldlm_pool_slv_max(limit))
+ slv = ldlm_pool_slv_max(limit);
+ else if (slv < ldlm_pool_slv_min(limit))
+ slv = ldlm_pool_slv_min(limit);
+
+ pl->pl_server_lock_volume = slv;
+}
+
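+/*
+ * E.g. limit = 10000, granted = 9000, grant_plan = 8460:
+ * grant_usage = 10000 - (9000 - 8460) = 9460 and
+ * slv_factor = (9460 << 10) / 10000 = 968, so the new SLV is roughly
+ * old SLV * 968 >> 10, i.e. about a 5% drop for this over-consumed period.
+ */
+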
+/**
+ * Recalculates next stats on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
+{
+ int grant_plan = pl->pl_grant_plan;
+ __u64 slv = pl->pl_server_lock_volume;
+ int granted = atomic_read(&pl->pl_granted);
+ int grant_rate = atomic_read(&pl->pl_grant_rate);
+ int cancel_rate = atomic_read(&pl->pl_cancel_rate);
+
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
+ slv);
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+ granted);
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+ grant_rate);
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+ grant_plan);
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+ cancel_rate);
+}
+
+/**
+ * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd.
+ */
+static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
+{
+ struct obd_device *obd;
+
+ /*
+ * Set the new SLV in an obd field so it can be used later without
+ * accessing the pool. This is needed to avoid a race between sending
+ * the reply with the new SLV to the client and cleaning up the server
+ * stack, during which we cannot guarantee that the namespace is still
+ * alive. We only know that the obd is alive as long as a valid export
+ * is alive.
+ */
+ obd = ldlm_pl2ns(pl)->ns_obd;
+ LASSERT(obd != NULL);
+ write_lock(&obd->obd_pool_lock);
+ obd->obd_pool_slv = pl->pl_server_lock_volume;
+ write_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates all pool fields on passed \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
+{
+ time_t recalc_interval_sec;
+
+ recalc_interval_sec = get_seconds() - pl->pl_recalc_time;
+ if (recalc_interval_sec < pl->pl_recalc_period)
+ return 0;
+
+ spin_lock(&pl->pl_lock);
+ recalc_interval_sec = get_seconds() - pl->pl_recalc_time;
+ if (recalc_interval_sec < pl->pl_recalc_period) {
+ spin_unlock(&pl->pl_lock);
+ return 0;
+ }
+ /*
+ * Recalc SLV after last period. This should be done
+ * _before_ recalculating new grant plan.
+ */
+ ldlm_pool_recalc_slv(pl);
+
+ /*
+ * Make sure that pool informed obd of last SLV changes.
+ */
+ ldlm_srv_pool_push_slv(pl);
+
+ /*
+ * Update grant_plan for new period.
+ */
+ ldlm_pool_recalc_grant_plan(pl);
+
+ pl->pl_recalc_time = get_seconds();
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+ recalc_interval_sec);
+ spin_unlock(&pl->pl_lock);
+ return 0;
+}
+
+/**
+ * This function is used on the server side as the main entry point for
+ * memory pressure handling. It decreases the SLV on \a pl according to
+ * the passed \a nr and \a gfp_mask.
+ *
+ * Our goal here is to decrease the SLV in such a way that clients hold
+ * roughly \a nr fewer locks over the next 10h.
+ */
+static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
+ int nr, gfp_t gfp_mask)
+{
+ __u32 limit;
+
+ /*
+ * VM is asking how many entries may be potentially freed.
+ */
+ if (nr == 0)
+ return atomic_read(&pl->pl_granted);
+
+ /*
+ * Clients have already cancelled their locks, but the server is
+ * already in the shrinker and can't cancel anything. Catch this race.
+ */
+ if (atomic_read(&pl->pl_granted) == 0)
+ return 0;
+
+ spin_lock(&pl->pl_lock);
+
+ /*
+ * We want the shrinker to cause cancellation of about @nr locks on
+ * clients, or to grant approximately @nr fewer locks in the next
+ * intervals.
+ *
+ * This is why we decrease the SLV by @nr. The effect only lasts one
+ * recalc interval (1s these days), which should be enough to pass the
+ * decreased SLV to all clients. On the next recalc interval the pool
+ * will either increase the SLV if the locking load is not high, keep
+ * it at the same level, or decrease it again; thus a shrinker-decreased
+ * SLV affects the following recalc intervals and lowers the locking load.
+ */
+ if (nr < pl->pl_server_lock_volume) {
+ pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr;
+ } else {
+ limit = ldlm_pool_get_limit(pl);
+ pl->pl_server_lock_volume = ldlm_pool_slv_min(limit);
+ }
+
+ /*
+ * Make sure that pool informed obd of last SLV changes.
+ */
+ ldlm_srv_pool_push_slv(pl);
+ spin_unlock(&pl->pl_lock);
+
+ /*
+ * We have not really freed any memory here so far; it may only be
+ * freed later, so return 0 to avoid confusing the VM.
+ */
+ return 0;
+}
+
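+/*
+ * E.g. with SLV = 1800000 and nr = 128 the SLV drops to 1799872; if nr were
+ * not smaller than the SLV, it would be clamped to ldlm_pool_slv_min(limit),
+ * i.e. 1. The return value is 0 either way, since no memory is freed here.
+ */
+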
+/**
+ * Setup server side pool \a pl with passed \a limit.
+ */
+static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
+{
+ struct obd_device *obd;
+
+ obd = ldlm_pl2ns(pl)->ns_obd;
+ LASSERT(obd != NULL && obd != LP_POISON);
+ LASSERT(obd->obd_type != LP_POISON);
+ write_lock(&obd->obd_pool_lock);
+ obd->obd_pool_limit = limit;
+ write_unlock(&obd->obd_pool_lock);
+
+ ldlm_pool_set_limit(pl, limit);
+ return 0;
+}
+
+/**
+ * Sets the SLV and Limit from ldlm_pl2ns(pl)->ns_obd into the passed \a pl.
+ */
+static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl)
+{
+ struct obd_device *obd;
+
+ /*
+ * Get the new SLV and Limit from the obd, which is updated by
+ * incoming RPCs.
+ */
+ obd = ldlm_pl2ns(pl)->ns_obd;
+ LASSERT(obd != NULL);
+ read_lock(&obd->obd_pool_lock);
+ pl->pl_server_lock_volume = obd->obd_pool_slv;
+ ldlm_pool_set_limit(pl, obd->obd_pool_limit);
+ read_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates the client side pool \a pl according to the current SLV and Limit.
+ */
+static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
+{
+ time_t recalc_interval_sec;
+ int ret;
+
+ recalc_interval_sec = get_seconds() - pl->pl_recalc_time;
+ if (recalc_interval_sec < pl->pl_recalc_period)
+ return 0;
+
+ spin_lock(&pl->pl_lock);
+ /*
+ * Check if we need to recalc lists now.
+ */
+ recalc_interval_sec = get_seconds() - pl->pl_recalc_time;
+ if (recalc_interval_sec < pl->pl_recalc_period) {
+ spin_unlock(&pl->pl_lock);
+ return 0;
+ }
+
+ /*
+ * Make sure that pool knows last SLV and Limit from obd.
+ */
+ ldlm_cli_pool_pop_slv(pl);
+
+ spin_unlock(&pl->pl_lock);
+
+ /*
+ * Do not cancel locks in case lru resize is disabled for this ns.
+ */
+ if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * When cancelling locks on the client we do not need to maintain
+ * sharp timing; we only want to cancel locks asap according to the new
+ * SLV. This may be called when the SLV has changed a lot, which is why
+ * we do not take pl->pl_recalc_time into account here.
+ */
+ ret = ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, LDLM_CANCEL_LRUR);
+
+out:
+ spin_lock(&pl->pl_lock);
+ /*
+ * Time of LRU resizing might be longer than period,
+ * so update after LRU resizing rather than before it.
+ */
+ pl->pl_recalc_time = get_seconds();
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+ recalc_interval_sec);
+ spin_unlock(&pl->pl_lock);
+ return ret;
+}
+
+/**
+ * This function is the main entry point for memory pressure handling on
+ * the client side. Its main goal is to cancel some number of locks on the
+ * passed \a pl according to \a nr and \a gfp_mask.
+ */
+static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
+ int nr, gfp_t gfp_mask)
+{
+ struct ldlm_namespace *ns;
+ int unused;
+
+ ns = ldlm_pl2ns(pl);
+
+ /*
+ * Do not cancel locks in case lru resize is disabled for this ns.
+ */
+ if (!ns_connect_lru_resize(ns))
+ return 0;
+
+ /*
+ * Make sure that pool knows last SLV and Limit from obd.
+ */
+ ldlm_cli_pool_pop_slv(pl);
+
+ spin_lock(&ns->ns_lock);
+ unused = ns->ns_nr_unused;
+ spin_unlock(&ns->ns_lock);
+
+ if (nr == 0)
+ return (unused / 100) * sysctl_vfs_cache_pressure;
+ else
+ return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_CANCEL_SHRINK);
+}
+
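+/*
+ * E.g. for the nr == 0 probe with 2000 unused locks and the default
+ * sysctl_vfs_cache_pressure of 100, this reports (2000 / 100) * 100 = 2000
+ * potentially freeable entries.
+ */
+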
+static const struct ldlm_pool_ops ldlm_srv_pool_ops = {
+ .po_recalc = ldlm_srv_pool_recalc,
+ .po_shrink = ldlm_srv_pool_shrink,
+ .po_setup = ldlm_srv_pool_setup
+};
+
+static const struct ldlm_pool_ops ldlm_cli_pool_ops = {
+ .po_recalc = ldlm_cli_pool_recalc,
+ .po_shrink = ldlm_cli_pool_shrink
+};
+
+/**
+ * Pool recalc wrapper. Calls either the client or the server pool recalc
+ * callback depending on which kind of pool \a pl is.
+ */
+int ldlm_pool_recalc(struct ldlm_pool *pl)
+{
+ time_t recalc_interval_sec;
+ int count;
+
+ recalc_interval_sec = get_seconds() - pl->pl_recalc_time;
+ if (recalc_interval_sec <= 0)
+ goto recalc;
+
+ spin_lock(&pl->pl_lock);
+ if (recalc_interval_sec > 0) {
+ /*
+ * Update pool statistics every 1s.
+ */
+ ldlm_pool_recalc_stats(pl);
+
+ /*
+ * Zero out all rates and speed for the last period.
+ */
+ atomic_set(&pl->pl_grant_rate, 0);
+ atomic_set(&pl->pl_cancel_rate, 0);
+ }
+ spin_unlock(&pl->pl_lock);
+
+ recalc:
+ if (pl->pl_ops->po_recalc != NULL) {
+ count = pl->pl_ops->po_recalc(pl);
+ lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+ count);
+ }
+ recalc_interval_sec = pl->pl_recalc_time - get_seconds() +
+ pl->pl_recalc_period;
+ if (recalc_interval_sec <= 0) {
+ /* Prevent too frequent recalculation. */
+ CDEBUG(D_DLMTRACE, "Negative interval(%ld), "
+ "too short period(%ld)",
+ recalc_interval_sec,
+ pl->pl_recalc_period);
+ recalc_interval_sec = 1;
+ }
+
+ return recalc_interval_sec;
+}
+
+/*
+ * Pool shrink wrapper. Calls either the client or the server pool shrink
+ * callback depending on which kind of pool pl is. When nr == 0, just return
+ * the number of freeable locks. Otherwise, return the number of cancelled
+ * locks.
+ */
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+ gfp_t gfp_mask)
+{
+ int cancel = 0;
+
+ if (pl->pl_ops->po_shrink != NULL) {
+ cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
+ if (nr > 0) {
+ lprocfs_counter_add(pl->pl_stats,
+ LDLM_POOL_SHRINK_REQTD_STAT,
+ nr);
+ lprocfs_counter_add(pl->pl_stats,
+ LDLM_POOL_SHRINK_FREED_STAT,
+ cancel);
+ CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, shrunk %d\n",
+ pl->pl_name, nr, cancel);
+ }
+ }
+ return cancel;
+}
+EXPORT_SYMBOL(ldlm_pool_shrink);
+
+/**
+ * Pool setup wrapper. Calls either the client or the server pool setup
+ * callback depending on which kind of pool \a pl is.
+ *
+ * Sets passed \a limit into pool \a pl.
+ */
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
+{
+ if (pl->pl_ops->po_setup != NULL)
+ return pl->pl_ops->po_setup(pl, limit);
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_pool_setup);
+
+#if defined(CONFIG_PROC_FS)
+static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused)
+{
+ int granted, grant_rate, cancel_rate, grant_step;
+ int grant_speed, grant_plan, lvf;
+ struct ldlm_pool *pl = m->private;
+ __u64 slv, clv;
+ __u32 limit;
+
+ spin_lock(&pl->pl_lock);
+ slv = pl->pl_server_lock_volume;
+ clv = pl->pl_client_lock_volume;
+ limit = ldlm_pool_get_limit(pl);
+ grant_plan = pl->pl_grant_plan;
+ granted = atomic_read(&pl->pl_granted);
+ grant_rate = atomic_read(&pl->pl_grant_rate);
+ cancel_rate = atomic_read(&pl->pl_cancel_rate);
+ grant_speed = grant_rate - cancel_rate;
+ lvf = atomic_read(&pl->pl_lock_volume_factor);
+ grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+ spin_unlock(&pl->pl_lock);
+
+ seq_printf(m, "LDLM pool state (%s):\n"
+ " SLV: %llu\n"
+ " CLV: %llu\n"
+ " LVF: %d\n",
+ pl->pl_name, slv, clv, lvf);
+
+ if (ns_is_server(ldlm_pl2ns(pl))) {
+ seq_printf(m, " GSP: %d%%\n"
+ " GP: %d\n",
+ grant_step, grant_plan);
+ }
+ seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n"
+ " G: %d\n L: %d\n",
+ grant_rate, cancel_rate, grant_speed,
+ granted, limit);
+
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lprocfs_pool_state);
+
+static int lprocfs_grant_speed_seq_show(struct seq_file *m, void *unused)
+{
+ struct ldlm_pool *pl = m->private;
+ int grant_speed;
+
+ spin_lock(&pl->pl_lock);
+ /* serialize with ldlm_pool_recalc */
+ grant_speed = atomic_read(&pl->pl_grant_rate) -
+ atomic_read(&pl->pl_cancel_rate);
+ spin_unlock(&pl->pl_lock);
+ return lprocfs_rd_uint(m, &grant_speed);
+}
+
+LDLM_POOL_PROC_READER_SEQ_SHOW(grant_plan, int);
+LPROC_SEQ_FOPS_RO(lprocfs_grant_plan);
+
+LDLM_POOL_PROC_READER_SEQ_SHOW(recalc_period, int);
+LDLM_POOL_PROC_WRITER(recalc_period, int);
+static ssize_t lprocfs_recalc_period_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+
+ return lprocfs_wr_recalc_period(file, buf, len, seq->private);
+}
+LPROC_SEQ_FOPS(lprocfs_recalc_period);
+
+LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, u64);
+LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, atomic);
+LPROC_SEQ_FOPS_RW_TYPE(ldlm_pool_rw, atomic);
+
+LPROC_SEQ_FOPS_RO(lprocfs_grant_speed);
+
+#define LDLM_POOL_ADD_VAR(name, var, ops) \
+ do { \
+ snprintf(var_name, MAX_STRING_SIZE, #name); \
+ pool_vars[0].data = var; \
+ pool_vars[0].fops = ops; \
+ lprocfs_add_vars(pl->pl_proc_dir, pool_vars, NULL);\
+ } while (0)
+
+static int ldlm_pool_proc_init(struct ldlm_pool *pl)
+{
+ struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+ struct proc_dir_entry *parent_ns_proc;
+ struct lprocfs_vars pool_vars[2];
+ char *var_name = NULL;
+ int rc = 0;
+
+ OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
+ if (!var_name)
+ return -ENOMEM;
+
+ parent_ns_proc = ns->ns_proc_dir_entry;
+ if (parent_ns_proc == NULL) {
+ CERROR("%s: proc entry is not initialized\n",
+ ldlm_ns_name(ns));
+ rc = -EINVAL;
+ goto out_free_name;
+ }
+ pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
+ NULL, NULL);
+ if (IS_ERR(pl->pl_proc_dir)) {
+ CERROR("LProcFS failed in ldlm-pool-init\n");
+ rc = PTR_ERR(pl->pl_proc_dir);
+ pl->pl_proc_dir = NULL;
+ goto out_free_name;
+ }
+
+ var_name[MAX_STRING_SIZE] = '\0';
+ memset(pool_vars, 0, sizeof(pool_vars));
+ pool_vars[0].name = var_name;
+
+ LDLM_POOL_ADD_VAR("server_lock_volume", &pl->pl_server_lock_volume,
+ &ldlm_pool_u64_fops);
+ LDLM_POOL_ADD_VAR("limit", &pl->pl_limit, &ldlm_pool_rw_atomic_fops);
+ LDLM_POOL_ADD_VAR("granted", &pl->pl_granted, &ldlm_pool_atomic_fops);
+ LDLM_POOL_ADD_VAR("grant_speed", pl, &lprocfs_grant_speed_fops);
+ LDLM_POOL_ADD_VAR("cancel_rate", &pl->pl_cancel_rate,
+ &ldlm_pool_atomic_fops);
+ LDLM_POOL_ADD_VAR("grant_rate", &pl->pl_grant_rate,
+ &ldlm_pool_atomic_fops);
+ LDLM_POOL_ADD_VAR("grant_plan", pl, &lprocfs_grant_plan_fops);
+ LDLM_POOL_ADD_VAR("recalc_period", pl, &lprocfs_recalc_period_fops);
+ LDLM_POOL_ADD_VAR("lock_volume_factor", &pl->pl_lock_volume_factor,
+ &ldlm_pool_rw_atomic_fops);
+ LDLM_POOL_ADD_VAR("state", pl, &lprocfs_pool_state_fops);
+
+ pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
+ LDLM_POOL_FIRST_STAT, 0);
+ if (!pl->pl_stats) {
+ rc = -ENOMEM;
+ goto out_free_name;
+ }
+
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "granted", "locks");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "grant", "locks");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "cancel", "locks");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "grant_rate", "locks/s");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "cancel_rate", "locks/s");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "grant_plan", "locks/s");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "slv", "slv");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "shrink_request", "locks");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "shrink_freed", "locks");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "recalc_freed", "locks");
+ lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+ LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+ "recalc_timing", "sec");
+ rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
+
+out_free_name:
+ OBD_FREE(var_name, MAX_STRING_SIZE + 1);
+ return rc;
+}
+
+static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
+{
+ if (pl->pl_stats != NULL) {
+ lprocfs_free_stats(&pl->pl_stats);
+ pl->pl_stats = NULL;
+ }
+ if (pl->pl_proc_dir != NULL) {
+ lprocfs_remove(&pl->pl_proc_dir);
+ pl->pl_proc_dir = NULL;
+ }
+}
+#else /* !CONFIG_PROC_FS */
+static int ldlm_pool_proc_init(struct ldlm_pool *pl)
+{
+ return 0;
+}
+
+static void ldlm_pool_proc_fini(struct ldlm_pool *pl) {}
+#endif /* CONFIG_PROC_FS */
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+ int idx, ldlm_side_t client)
+{
+ int rc;
+
+ spin_lock_init(&pl->pl_lock);
+ atomic_set(&pl->pl_granted, 0);
+ pl->pl_recalc_time = get_seconds();
+ atomic_set(&pl->pl_lock_volume_factor, 1);
+
+ atomic_set(&pl->pl_grant_rate, 0);
+ atomic_set(&pl->pl_cancel_rate, 0);
+ pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);
+
+ snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
+ ldlm_ns_name(ns), idx);
+
+ if (client == LDLM_NAMESPACE_SERVER) {
+ pl->pl_ops = &ldlm_srv_pool_ops;
+ ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
+ pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD;
+ pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L);
+ } else {
+ ldlm_pool_set_limit(pl, 1);
+ pl->pl_server_lock_volume = 0;
+ pl->pl_ops = &ldlm_cli_pool_ops;
+ pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
+ }
+ pl->pl_client_lock_volume = 0;
+ rc = ldlm_pool_proc_init(pl);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
+
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_pool_init);
+
+void ldlm_pool_fini(struct ldlm_pool *pl)
+{
+ ldlm_pool_proc_fini(pl);
+
+ /*
+ * Pool should not be used after this point. We can't free it here as
+	 * it lives in struct ldlm_namespace, but we are still interested in
+	 * catching any abnormal use.
+ */
+ POISON(pl, 0x5a, sizeof(*pl));
+}
+EXPORT_SYMBOL(ldlm_pool_fini);
+
+/**
+ * Add new taken ldlm lock \a lock into pool \a pl accounting.
+ */
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+ /*
+	 * FLOCK locks are special in the sense that they are almost never
+	 * cancelled; instead, a special kind of lock is used to drop them.
+	 * There is also no LRU for flock locks, so there is no point in
+	 * tracking them anyway.
+ */
+ if (lock->l_resource->lr_type == LDLM_FLOCK)
+ return;
+
+ atomic_inc(&pl->pl_granted);
+ atomic_inc(&pl->pl_grant_rate);
+ lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
+ /*
+	 * Do not do pool recalc for the client side, as all locks which
+	 * potentially may be canceled have already been packed into
+	 * the enqueue/cancel RPC. Also, we do not want to run out of stack
+	 * with too long call paths.
+ */
+ if (ns_is_server(ldlm_pl2ns(pl)))
+ ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_add);
+
+/**
+ * Remove ldlm lock \a lock from pool \a pl accounting.
+ */
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+ /*
+ * Filter out FLOCK locks. Read above comment in ldlm_pool_add().
+ */
+ if (lock->l_resource->lr_type == LDLM_FLOCK)
+ return;
+
+ LASSERT(atomic_read(&pl->pl_granted) > 0);
+ atomic_dec(&pl->pl_granted);
+ atomic_inc(&pl->pl_cancel_rate);
+
+ lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
+
+ if (ns_is_server(ldlm_pl2ns(pl)))
+ ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_del);
+
+/**
+ * Returns current \a pl SLV.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
+{
+ __u64 slv;
+
+ spin_lock(&pl->pl_lock);
+ slv = pl->pl_server_lock_volume;
+ spin_unlock(&pl->pl_lock);
+ return slv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_slv);
+
+/**
+ * Sets passed \a slv to \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
+{
+ spin_lock(&pl->pl_lock);
+ pl->pl_server_lock_volume = slv;
+ spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_slv);
+
+/**
+ * Returns current \a pl CLV.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl)
+{
+ __u64 slv;
+
+ spin_lock(&pl->pl_lock);
+ slv = pl->pl_client_lock_volume;
+ spin_unlock(&pl->pl_lock);
+ return slv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_clv);
+
+/**
+ * Sets passed \a clv to \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv)
+{
+ spin_lock(&pl->pl_lock);
+ pl->pl_client_lock_volume = clv;
+ spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_clv);
+
+/**
+ * Returns current \a pl limit.
+ */
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
+{
+ return atomic_read(&pl->pl_limit);
+}
+EXPORT_SYMBOL(ldlm_pool_get_limit);
+
+/**
+ * Sets passed \a limit to \a pl.
+ */
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
+{
+ atomic_set(&pl->pl_limit, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_set_limit);
+
+/**
+ * Returns current LVF from \a pl.
+ */
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl)
+{
+ return atomic_read(&pl->pl_lock_volume_factor);
+}
+EXPORT_SYMBOL(ldlm_pool_get_lvf);
+
+static int ldlm_pool_granted(struct ldlm_pool *pl)
+{
+ return atomic_read(&pl->pl_granted);
+}
+
+static struct ptlrpc_thread *ldlm_pools_thread;
+static struct completion ldlm_pools_comp;
+
+/*
+ * count locks from all namespaces (if possible). Returns number of
+ * cached locks.
+ */
+static unsigned long ldlm_pools_count(ldlm_side_t client, gfp_t gfp_mask)
+{
+ int total = 0, nr_ns;
+ struct ldlm_namespace *ns;
+ struct ldlm_namespace *ns_old = NULL; /* loop detection */
+ void *cookie;
+
+ if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS))
+ return 0;
+
+ CDEBUG(D_DLMTRACE, "Request to count %s locks from all pools\n",
+ client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
+
+ cookie = cl_env_reenter();
+
+ /*
+ * Find out how many resources we may release.
+ */
+ for (nr_ns = ldlm_namespace_nr_read(client);
+ nr_ns > 0; nr_ns--) {
+ mutex_lock(ldlm_namespace_lock(client));
+ if (list_empty(ldlm_namespace_list(client))) {
+ mutex_unlock(ldlm_namespace_lock(client));
+ cl_env_reexit(cookie);
+ return 0;
+ }
+ ns = ldlm_namespace_first_locked(client);
+
+ if (ns == ns_old) {
+ mutex_unlock(ldlm_namespace_lock(client));
+ break;
+ }
+
+ if (ldlm_ns_empty(ns)) {
+ ldlm_namespace_move_to_inactive_locked(ns, client);
+ mutex_unlock(ldlm_namespace_lock(client));
+ continue;
+ }
+
+ if (ns_old == NULL)
+ ns_old = ns;
+
+ ldlm_namespace_get(ns);
+ ldlm_namespace_move_to_active_locked(ns, client);
+ mutex_unlock(ldlm_namespace_lock(client));
+ total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
+ ldlm_namespace_put(ns);
+ }
+
+ cl_env_reexit(cookie);
+ return total;
+}
+
+static unsigned long ldlm_pools_scan(ldlm_side_t client, int nr, gfp_t gfp_mask)
+{
+ unsigned long freed = 0;
+ int tmp, nr_ns;
+ struct ldlm_namespace *ns;
+ void *cookie;
+
+ if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS))
+ return -1;
+
+ cookie = cl_env_reenter();
+
+ /*
+ * Shrink at least ldlm_namespace_nr_read(client) namespaces.
+ */
+ for (tmp = nr_ns = ldlm_namespace_nr_read(client);
+ tmp > 0; tmp--) {
+ int cancel, nr_locks;
+
+ /*
+ * Do not call shrink under ldlm_namespace_lock(client)
+ */
+ mutex_lock(ldlm_namespace_lock(client));
+ if (list_empty(ldlm_namespace_list(client))) {
+ mutex_unlock(ldlm_namespace_lock(client));
+ break;
+ }
+ ns = ldlm_namespace_first_locked(client);
+ ldlm_namespace_get(ns);
+ ldlm_namespace_move_to_active_locked(ns, client);
+ mutex_unlock(ldlm_namespace_lock(client));
+
+ nr_locks = ldlm_pool_granted(&ns->ns_pool);
+ /*
+		 * We used to shrink proportionally, but with the new shrinker
+		 * API we lost the total number of freeable locks.
+ */
+ cancel = 1 + min_t(int, nr_locks, nr / nr_ns);
+ freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
+ ldlm_namespace_put(ns);
+ }
+ cl_env_reexit(cookie);
+ /*
+	 * We only decrease the SLV in the server pools shrinker; return
+	 * SHRINK_STOP to the kernel to avoid a needless loop. LU-1128
+ */
+ return (client == LDLM_NAMESPACE_SERVER) ? SHRINK_STOP : freed;
+}
+
+static unsigned long ldlm_pools_srv_count(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ return ldlm_pools_count(LDLM_NAMESPACE_SERVER, sc->gfp_mask);
+}
+
+static unsigned long ldlm_pools_srv_scan(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ return ldlm_pools_scan(LDLM_NAMESPACE_SERVER, sc->nr_to_scan,
+ sc->gfp_mask);
+}
+
+static unsigned long ldlm_pools_cli_count(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask);
+}
+
+static unsigned long ldlm_pools_cli_scan(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan,
+ sc->gfp_mask);
+}
+
+int ldlm_pools_recalc(ldlm_side_t client)
+{
+ __u32 nr_l = 0, nr_p = 0, l;
+ struct ldlm_namespace *ns;
+ struct ldlm_namespace *ns_old = NULL;
+ int nr, equal = 0;
+ int time = 50; /* seconds of sleep if no active namespaces */
+
+ /*
+	 * No need to set up the pool limit for client pools.
+ */
+ if (client == LDLM_NAMESPACE_SERVER) {
+ /*
+ * Check all modest namespaces first.
+ */
+ mutex_lock(ldlm_namespace_lock(client));
+ list_for_each_entry(ns, ldlm_namespace_list(client),
+ ns_list_chain) {
+ if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
+ continue;
+
+ l = ldlm_pool_granted(&ns->ns_pool);
+ if (l == 0)
+ l = 1;
+
+ /*
+ * Set the modest pools limit equal to their avg granted
+ * locks + ~6%.
+ */
+ l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0);
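+			/*
+			 * For example, assuming LDLM_POOLS_MODEST_MARGIN_SHIFT
+			 * is 4 (a 1/16 margin): with l = 1600 granted locks,
+			 * dru() adds 1600 >> 4 = 100, so the modest pool limit
+			 * becomes roughly 1700, the ~6% margin noted above.
+			 */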
+ ldlm_pool_setup(&ns->ns_pool, l);
+ nr_l += l;
+ nr_p++;
+ }
+
+ /*
+		 * Make sure that modest namespaces did not eat more than 2/3
+		 * of the limit.
+ */
+ if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
+ CWARN("\"Modest\" pools eat out 2/3 of server locks limit (%d of %lu). This means that you have too many clients for this amount of server RAM. Upgrade server!\n",
+ nr_l, LDLM_POOL_HOST_L);
+ equal = 1;
+ }
+
+ /*
+ * The rest is given to greedy namespaces.
+ */
+ list_for_each_entry(ns, ldlm_namespace_list(client),
+ ns_list_chain) {
+ if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
+ continue;
+
+ if (equal) {
+ /*
+				 * In the case where 2/3 of the locks are eaten
+				 * by modest pools, we re-set an equal limit
+				 * for _all_ pools.
+ */
+ l = LDLM_POOL_HOST_L /
+ ldlm_namespace_nr_read(client);
+ } else {
+ /*
+				 * All the remaining greedy pools share the
+				 * remaining locks in equal parts.
+ */
+ l = (LDLM_POOL_HOST_L - nr_l) /
+ (ldlm_namespace_nr_read(client) -
+ nr_p);
+ }
+ ldlm_pool_setup(&ns->ns_pool, l);
+ }
+ mutex_unlock(ldlm_namespace_lock(client));
+ }
+
+ /*
+ * Recalc at least ldlm_namespace_nr_read(client) namespaces.
+ */
+ for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) {
+ int skip;
+ /*
+ * Lock the list, get first @ns in the list, getref, move it
+ * to the tail, unlock and call pool recalc. This way we avoid
+ * calling recalc under @ns lock what is really good as we get
+ * rid of potential deadlock on client nodes when canceling
+ * locks synchronously.
+ */
+ mutex_lock(ldlm_namespace_lock(client));
+ if (list_empty(ldlm_namespace_list(client))) {
+ mutex_unlock(ldlm_namespace_lock(client));
+ break;
+ }
+ ns = ldlm_namespace_first_locked(client);
+
+ if (ns_old == ns) { /* Full pass complete */
+ mutex_unlock(ldlm_namespace_lock(client));
+ break;
+ }
+
+		/* If we find an empty namespace, we need to move it back to
+		 * the inactive list.
+ * The race with parallel resource creation is fine:
+ * - If they do namespace_get before our check, we fail the
+ * check and they move this item to the end of the list anyway
+ * - If we do the check and then they do namespace_get, then
+ * we move the namespace to inactive and they will move
+ * it back to active (synchronised by the lock, so no clash
+ * there).
+ */
+ if (ldlm_ns_empty(ns)) {
+ ldlm_namespace_move_to_inactive_locked(ns, client);
+ mutex_unlock(ldlm_namespace_lock(client));
+ continue;
+ }
+
+ if (ns_old == NULL)
+ ns_old = ns;
+
+ spin_lock(&ns->ns_lock);
+ /*
+		 * Skip a namespace which is being freed; we don't want to
+		 * increase its refcount again, not even temporarily.
+		 * bz21519 & LU-499.
+ */
+ if (ns->ns_stopping) {
+ skip = 1;
+ } else {
+ skip = 0;
+ ldlm_namespace_get(ns);
+ }
+ spin_unlock(&ns->ns_lock);
+
+ ldlm_namespace_move_to_active_locked(ns, client);
+ mutex_unlock(ldlm_namespace_lock(client));
+
+ /*
+ * After setup is done - recalc the pool.
+ */
+ if (!skip) {
+ int ttime = ldlm_pool_recalc(&ns->ns_pool);
+
+ if (ttime < time)
+ time = ttime;
+
+ ldlm_namespace_put(ns);
+ }
+ }
+ return time;
+}
+EXPORT_SYMBOL(ldlm_pools_recalc);
+
+static int ldlm_pools_thread_main(void *arg)
+{
+ struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+ int s_time, c_time;
+
+ thread_set_flags(thread, SVC_RUNNING);
+ wake_up(&thread->t_ctl_waitq);
+
+ CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
+ "ldlm_poold", current_pid());
+
+ while (1) {
+ struct l_wait_info lwi;
+
+ /*
+		 * Recalc all pools on this tick.
+ */
+ s_time = ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
+ c_time = ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
+
+ /*
+ * Wait until the next check time, or until we're
+ * stopped.
+ */
+ lwi = LWI_TIMEOUT(cfs_time_seconds(min(s_time, c_time)),
+ NULL, NULL);
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_stopping(thread) ||
+ thread_is_event(thread),
+ &lwi);
+
+ if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+ break;
+ else
+ thread_test_and_clear_flags(thread, SVC_EVENT);
+ }
+
+ thread_set_flags(thread, SVC_STOPPED);
+ wake_up(&thread->t_ctl_waitq);
+
+ CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
+ "ldlm_poold", current_pid());
+
+ complete_and_exit(&ldlm_pools_comp, 0);
+}
+
+static int ldlm_pools_thread_start(void)
+{
+ struct l_wait_info lwi = { 0 };
+ struct task_struct *task;
+
+ if (ldlm_pools_thread != NULL)
+ return -EALREADY;
+
+ OBD_ALLOC_PTR(ldlm_pools_thread);
+ if (ldlm_pools_thread == NULL)
+ return -ENOMEM;
+
+ init_completion(&ldlm_pools_comp);
+ init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq);
+
+ task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread,
+ "ldlm_poold");
+ if (IS_ERR(task)) {
+ CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task));
+ OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
+ ldlm_pools_thread = NULL;
+ return PTR_ERR(task);
+ }
+ l_wait_event(ldlm_pools_thread->t_ctl_waitq,
+ thread_is_running(ldlm_pools_thread), &lwi);
+ return 0;
+}
+
+static void ldlm_pools_thread_stop(void)
+{
+ if (ldlm_pools_thread == NULL)
+ return;
+
+ thread_set_flags(ldlm_pools_thread, SVC_STOPPING);
+ wake_up(&ldlm_pools_thread->t_ctl_waitq);
+
+ /*
+	 * Make sure that the pools thread is finished before freeing @thread.
+	 * This fixes a possible race and oops due to accessing freed memory
+	 * in the pools thread.
+ */
+ wait_for_completion(&ldlm_pools_comp);
+ OBD_FREE_PTR(ldlm_pools_thread);
+ ldlm_pools_thread = NULL;
+}
+
+static struct shrinker ldlm_pools_srv_shrinker = {
+ .count_objects = ldlm_pools_srv_count,
+ .scan_objects = ldlm_pools_srv_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+static struct shrinker ldlm_pools_cli_shrinker = {
+ .count_objects = ldlm_pools_cli_count,
+ .scan_objects = ldlm_pools_cli_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+int ldlm_pools_init(void)
+{
+ int rc;
+
+ rc = ldlm_pools_thread_start();
+ if (rc == 0) {
+ register_shrinker(&ldlm_pools_srv_shrinker);
+ register_shrinker(&ldlm_pools_cli_shrinker);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_pools_init);
+
+void ldlm_pools_fini(void)
+{
+ unregister_shrinker(&ldlm_pools_srv_shrinker);
+ unregister_shrinker(&ldlm_pools_cli_shrinker);
+ ldlm_pools_thread_stop();
+}
+EXPORT_SYMBOL(ldlm_pools_fini);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
new file mode 100644
index 000000000..4f7131831
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
@@ -0,0 +1,2294 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/**
+ * This file contains Asynchronous System Trap (AST) handlers and related
+ * LDLM request-processing routines.
+ *
+ * An AST is a callback issued on a lock when its state is changed. There are
+ * several different types of ASTs (callbacks) registered for each lock:
+ *
+ * - completion AST: when a lock is enqueued by some process, but cannot be
+ * granted immediately due to other conflicting locks on the same resource,
+ * the completion AST is sent to notify the caller when the lock is
+ * eventually granted
+ *
+ * - blocking AST: when a lock is granted to some process, if another process
+ * enqueues a conflicting (blocking) lock on a resource, a blocking AST is
+ * sent to notify the holder(s) of the lock(s) of the conflicting lock
+ * request. The lock holder(s) must release their lock(s) on that resource in
+ * a timely manner or be evicted by the server.
+ *
+ * - glimpse AST: this is used when a process wants information about a lock
+ * (i.e. the lock value block (LVB)) but does not necessarily require holding
+ * the lock. If the resource is locked, the lock holder(s) are sent glimpse
+ * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL
+ * their lock(s) if they are idle. If the resource is not locked, the server
+ * may grant the lock.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include "../include/lustre_dlm.h"
+#include "../include/obd_class.h"
+#include "../include/obd.h"
+
+#include "ldlm_internal.h"
+
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+module_param(ldlm_enqueue_min, int, 0644);
+MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum");
+
+/* On the client side, whether cached locks will be canceled before replay */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
+static void interrupted_completion_wait(void *data)
+{
+}
+
+struct lock_wait_data {
+ struct ldlm_lock *lwd_lock;
+ __u32 lwd_conn_cnt;
+};
+
+struct ldlm_async_args {
+ struct lustre_handle lock_handle;
+};
+
+int ldlm_expired_completion_wait(void *data)
+{
+ struct lock_wait_data *lwd = data;
+ struct ldlm_lock *lock = lwd->lwd_lock;
+ struct obd_import *imp;
+ struct obd_device *obd;
+
+ if (lock->l_conn_export == NULL) {
+ static unsigned long next_dump, last_dump;
+
+ LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", "
+ CFS_DURATION_T"s ago)\n",
+ lock->l_last_activity,
+ cfs_time_sub(get_seconds(),
+ lock->l_last_activity));
+ LDLM_DEBUG(lock, "lock timed out (enqueued at " CFS_TIME_T ", " CFS_DURATION_T "s ago); not entering recovery in server code, just going back to sleep",
+ lock->l_last_activity,
+ cfs_time_sub(get_seconds(),
+ lock->l_last_activity));
+ if (cfs_time_after(cfs_time_current(), next_dump)) {
+ last_dump = next_dump;
+ next_dump = cfs_time_shift(300);
+ ldlm_namespace_dump(D_DLMTRACE,
+ ldlm_lock_to_ns(lock));
+ if (last_dump == 0)
+ libcfs_debug_dumplog();
+ }
+ return 0;
+ }
+
+ obd = lock->l_conn_export->exp_obd;
+ imp = obd->u.cli.cl_import;
+ ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
+ LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+ CFS_DURATION_T"s ago), entering recovery for %s@%s",
+ lock->l_last_activity,
+ cfs_time_sub(get_seconds(), lock->l_last_activity),
+ obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
+
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_expired_completion_wait);
+
+/* We use the same basis for both server side and client side functions
+ from a single node. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+ int timeout = at_get(ldlm_lock_to_ns_at(lock));
+
+ if (AT_OFF)
+ return obd_timeout / 2;
+ /* Since these are non-updating timeouts, we should be conservative.
+ It would be nice to have some kind of "early reply" mechanism for
+ lock callbacks too... */
+ timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
+ return max(timeout, ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_get_enq_timeout);
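+
+/*
+ * A worked example of the computation above (values are illustrative only):
+ * with adaptive timeouts enabled and an AT estimate of 20s, the timeout is
+ * first bumped by 50% to 30s, clamped to at_max, and then raised to at least
+ * ldlm_enqueue_min. With AT_OFF it is simply obd_timeout / 2.
+ */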
+
+/**
+ * Helper function for ldlm_completion_ast(), updating timings when lock is
+ * actually granted.
+ */
+static int ldlm_completion_tail(struct ldlm_lock *lock)
+{
+ long delay;
+ int result;
+
+ if (lock->l_flags & (LDLM_FL_DESTROYED | LDLM_FL_FAILED)) {
+ LDLM_DEBUG(lock, "client-side enqueue: destroyed");
+ result = -EIO;
+ } else {
+ delay = cfs_time_sub(get_seconds(),
+ lock->l_last_activity);
+ LDLM_DEBUG(lock, "client-side enqueue: granted after "
+ CFS_DURATION_T"s", delay);
+
+ /* Update our time estimate */
+ at_measured(ldlm_lock_to_ns_at(lock),
+ delay);
+ result = 0;
+ }
+ return result;
+}
+
+/**
+ * Implementation of ->l_completion_ast() for a client that doesn't wait
+ * until the lock is granted. Suitable for locks enqueued through ptlrpcd or
+ * other threads that cannot block for long.
+ */
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+ if (flags == LDLM_FL_WAIT_NOREPROC) {
+ LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+ return 0;
+ }
+
+ if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+ LDLM_FL_BLOCK_CONV))) {
+ wake_up(&lock->l_waitq);
+ return ldlm_completion_tail(lock);
+ }
+
+ LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, going forward");
+ ldlm_reprocess_all(lock->l_resource);
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_completion_ast_async);
+
+/**
+ * Generic LDLM "completion" AST. This is called in several cases:
+ *
+ * - when a reply to an ENQUEUE RPC is received from the server
+ * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
+ * this point (determined by flags);
+ *
+ * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
+ * been granted;
+ *
+ * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
+ * gets correct lvb;
+ *
+ * - to force all locks when resource is destroyed (cleanup_resource());
+ *
+ * - during lock conversion (not used currently).
+ *
+ * If the lock is not granted in the first case, this function waits until the
+ * second or penultimate case happens in some other thread.
+ *
+ */
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+ /* XXX ALLOCATE - 160 bytes */
+ struct lock_wait_data lwd;
+ struct obd_device *obd;
+ struct obd_import *imp = NULL;
+ struct l_wait_info lwi;
+ __u32 timeout;
+ int rc = 0;
+
+ if (flags == LDLM_FL_WAIT_NOREPROC) {
+ LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+ goto noreproc;
+ }
+
+ if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+ LDLM_FL_BLOCK_CONV))) {
+ wake_up(&lock->l_waitq);
+ return 0;
+ }
+
+ LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, sleeping");
+
+noreproc:
+
+ obd = class_exp2obd(lock->l_conn_export);
+
+ /* if this is a local lock, then there is no import */
+ if (obd != NULL)
+ imp = obd->u.cli.cl_import;
+
+ /* Wait a long time for enqueue - server may have to callback a
+ lock from another client. Server will evict the other client if it
+ doesn't respond reasonably, and then give us the lock. */
+ timeout = ldlm_get_enq_timeout(lock) * 2;
+
+ lwd.lwd_lock = lock;
+
+ if (lock->l_flags & LDLM_FL_NO_TIMEOUT) {
+ LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
+ lwi = LWI_INTR(interrupted_completion_wait, &lwd);
+ } else {
+ lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+ ldlm_expired_completion_wait,
+ interrupted_completion_wait, &lwd);
+ }
+
+ if (imp != NULL) {
+ spin_lock(&imp->imp_lock);
+ lwd.lwd_conn_cnt = imp->imp_conn_cnt;
+ spin_unlock(&imp->imp_lock);
+ }
+
+ if (ns_is_client(ldlm_lock_to_ns(lock)) &&
+ OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
+ OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
+ lock->l_flags |= LDLM_FL_FAIL_LOC;
+ rc = -EINTR;
+ } else {
+ /* Go to sleep until the lock is granted or cancelled. */
+ rc = l_wait_event(lock->l_waitq,
+ is_granted_or_cancelled(lock), &lwi);
+ }
+
+ if (rc) {
+ LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+ rc);
+ return rc;
+ }
+
+ return ldlm_completion_tail(lock);
+}
+EXPORT_SYMBOL(ldlm_completion_ast);
+
+/**
+ * A helper to build a blocking AST function
+ *
+ * Perform a common operation for blocking ASTs:
+ * deferred lock cancellation.
+ *
+ * \param lock the lock blocking or canceling AST was called on
+ * \retval 0
+ * \see mdt_blocking_ast
+ * \see ldlm_blocking_ast
+ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
+{
+ int do_ast;
+
+ lock->l_flags |= LDLM_FL_CBPENDING;
+ do_ast = !lock->l_readers && !lock->l_writers;
+ unlock_res_and_lock(lock);
+
+ if (do_ast) {
+ struct lustre_handle lockh;
+ int rc;
+
+ LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
+ ldlm_lock2handle(lock, &lockh);
+ rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+ if (rc < 0)
+ CERROR("ldlm_cli_cancel: %d\n", rc);
+ } else {
+ LDLM_DEBUG(lock, "Lock still has references, will be cancelled later");
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
+
+/**
+ * Server blocking AST
+ *
+ * ->l_blocking_ast() callback for LDLM locks acquired by server-side
+ * OBDs.
+ *
+ * \param lock the lock which blocks a request or cancelling lock
+ * \param desc unused
+ * \param data unused
+ * \param flag indicates whether this is a cancelling or blocking callback
+ * \retval 0
+ * \see ldlm_blocking_ast_nocheck
+ */
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag)
+{
+ if (flag == LDLM_CB_CANCELING) {
+ /* Don't need to do anything here. */
+ return 0;
+ }
+
+ lock_res_and_lock(lock);
+ /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
+ * that ldlm_blocking_ast is called just before intent_policy method
+ * takes the lr_lock, then by the time we get the lock, we might not
+ * be the correct blocking function anymore. So check, and return
+ * early, if so. */
+ if (lock->l_blocking_ast != ldlm_blocking_ast) {
+ unlock_res_and_lock(lock);
+ return 0;
+ }
+ return ldlm_blocking_ast_nocheck(lock);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast);
+
+/**
+ * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See
+ * comment in filter_intent_policy() on why you may need this.
+ */
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+{
+ /*
+ * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for
+ * that is rather subtle: with OST-side locking, it may so happen that
+ * _all_ extent locks are held by the OST. If client wants to obtain
+ * current file size it calls ll{,u}_glimpse_size(), and (as locks are
+ * on the server), dummy glimpse callback fires and does
+ * nothing. Client still receives correct file size due to the
+ * following fragment in filter_intent_policy():
+ *
+ * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB
+ * if (rc != 0 && res->lr_namespace->ns_lvbo &&
+ * res->lr_namespace->ns_lvbo->lvbo_update) {
+ * res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1);
+ * }
+ *
+ * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and
+ * returns correct file size to the client.
+ */
+ return -ELDLM_NO_LOCK_DATA;
+}
+EXPORT_SYMBOL(ldlm_glimpse_ast);
+
+/**
+ * Enqueue a local lock (typically on a server).
+ */
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_type_t type, ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, __u64 *flags,
+ ldlm_blocking_callback blocking,
+ ldlm_completion_callback completion,
+ ldlm_glimpse_callback glimpse,
+ void *data, __u32 lvb_len, enum lvb_type lvb_type,
+ const __u64 *client_cookie,
+ struct lustre_handle *lockh)
+{
+ struct ldlm_lock *lock;
+ int err;
+ const struct ldlm_callback_suite cbs = { .lcs_completion = completion,
+ .lcs_blocking = blocking,
+ .lcs_glimpse = glimpse,
+ };
+
+ LASSERT(!(*flags & LDLM_FL_REPLAY));
+ if (unlikely(ns_is_client(ns))) {
+ CERROR("Trying to enqueue local lock in a shadow namespace\n");
+ LBUG();
+ }
+
+ lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len,
+ lvb_type);
+ if (unlikely(!lock)) {
+ err = -ENOMEM;
+ goto out_nolock;
+ }
+
+ ldlm_lock2handle(lock, lockh);
+
+ /* NB: we don't have any lock now (lock_res_and_lock)
+ * because it's a new lock */
+ ldlm_lock_addref_internal_nolock(lock, mode);
+ lock->l_flags |= LDLM_FL_LOCAL;
+ if (*flags & LDLM_FL_ATOMIC_CB)
+ lock->l_flags |= LDLM_FL_ATOMIC_CB;
+
+ if (policy != NULL)
+ lock->l_policy_data = *policy;
+ if (client_cookie != NULL)
+ lock->l_client_cookie = *client_cookie;
+ if (type == LDLM_EXTENT)
+ lock->l_req_extent = policy->l_extent;
+
+ err = ldlm_lock_enqueue(ns, &lock, policy, flags);
+ if (unlikely(err != ELDLM_OK))
+ goto out;
+
+ if (policy != NULL)
+ *policy = lock->l_policy_data;
+
+ if (lock->l_completion_ast)
+ lock->l_completion_ast(lock, *flags, NULL);
+
+ LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
+ out:
+ LDLM_LOCK_RELEASE(lock);
+ out_nolock:
+ return err;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
+
+static void failed_lock_cleanup(struct ldlm_namespace *ns,
+ struct ldlm_lock *lock, int mode)
+{
+ int need_cancel = 0;
+
+ /* Set a flag to prevent us from sending a CANCEL (bug 407) */
+ lock_res_and_lock(lock);
+ /* Check that lock is not granted or failed, we might race. */
+ if ((lock->l_req_mode != lock->l_granted_mode) &&
+ !(lock->l_flags & LDLM_FL_FAILED)) {
+		/* Make sure that this lock will not be found by a raced
+		 * bl_ast and that an -EINVAL reply is sent to the server
+		 * anyway. bug 17645 */
+ lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
+ LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
+ need_cancel = 1;
+ }
+ unlock_res_and_lock(lock);
+
+ if (need_cancel)
+ LDLM_DEBUG(lock,
+ "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+ else
+ LDLM_DEBUG(lock, "lock was granted or failed in race");
+
+ ldlm_lock_decref_internal(lock, mode);
+
+ /* XXX - HACK because we shouldn't call ldlm_lock_destroy()
+ * from llite/file.c/ll_file_flock(). */
+	/* This code accounts for the fact that we do not have a blocking
+	 * handler on the client for flock locks. As such, this is the place
+	 * where we must completely kill failed locks (those interrupted and
+	 * those that were waiting to be granted when the server evicted us). */
+ if (lock->l_resource->lr_type == LDLM_FLOCK) {
+ lock_res_and_lock(lock);
+ ldlm_resource_unlink_lock(lock);
+ ldlm_lock_destroy_nolock(lock);
+ unlock_res_and_lock(lock);
+ }
+}
+
+/**
+ * Finishing portion of client lock enqueue code.
+ *
+ * Called after receiving reply from server.
+ */
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+ ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+ __u64 *flags, void *lvb, __u32 lvb_len,
+ struct lustre_handle *lockh, int rc)
+{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ int is_replay = *flags & LDLM_FL_REPLAY;
+ struct ldlm_lock *lock;
+ struct ldlm_reply *reply;
+ int cleanup_phase = 1;
+ int size = 0;
+
+ lock = ldlm_handle2lock(lockh);
+ /* ldlm_cli_enqueue is holding a reference on this lock. */
+ if (!lock) {
+ LASSERT(type == LDLM_FLOCK);
+ return -ENOLCK;
+ }
+
+ LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
+ "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);
+
+ if (rc != ELDLM_OK) {
+ LASSERT(!is_replay);
+ LDLM_DEBUG(lock, "client-side enqueue END (%s)",
+ rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
+
+ if (rc != ELDLM_LOCK_ABORTED)
+ goto cleanup;
+ }
+
+ /* Before we return, swab the reply */
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ if (reply == NULL) {
+ rc = -EPROTO;
+ goto cleanup;
+ }
+
+ if (lvb_len != 0) {
+ LASSERT(lvb != NULL);
+
+ size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
+ RCL_SERVER);
+ if (size < 0) {
+ LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size);
+ rc = size;
+ goto cleanup;
+ } else if (unlikely(size > lvb_len)) {
+ LDLM_ERROR(lock, "Replied LVB is larger than expectation, expected = %d, replied = %d",
+ lvb_len, size);
+ rc = -EINVAL;
+ goto cleanup;
+ }
+ }
+
+ if (rc == ELDLM_LOCK_ABORTED) {
+ if (lvb_len != 0)
+ rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+ lvb, size);
+ if (rc == 0)
+ rc = ELDLM_LOCK_ABORTED;
+ goto cleanup;
+ }
+
+ /* lock enqueued on the server */
+ cleanup_phase = 0;
+
+ lock_res_and_lock(lock);
+ /* Key change rehash lock in per-export hash with new key */
+ if (exp->exp_lock_hash) {
+ /* In the function below, .hs_keycmp resolves to
+ * ldlm_export_lock_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ cfs_hash_rehash_key(exp->exp_lock_hash,
+ &lock->l_remote_handle,
+ &reply->lock_handle,
+ &lock->l_exp_hash);
+ } else {
+ lock->l_remote_handle = reply->lock_handle;
+ }
+
+ *flags = ldlm_flags_from_wire(reply->lock_flags);
+ lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+ LDLM_INHERIT_FLAGS);
+ /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
+ * to wait with no timeout as well */
+ lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+ LDLM_FL_NO_TIMEOUT);
+ unlock_res_and_lock(lock);
+
+ CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: 0x%llx\n",
+ lock, reply->lock_handle.cookie, *flags);
+
+ /* If enqueue returned a blocked lock but the completion handler has
+ * already run, then it fixed up the resource and we don't need to do it
+ * again. */
+ if ((*flags) & LDLM_FL_LOCK_CHANGED) {
+ int newmode = reply->lock_desc.l_req_mode;
+
+ LASSERT(!is_replay);
+ if (newmode && newmode != lock->l_req_mode) {
+ LDLM_DEBUG(lock, "server returned different mode %s",
+ ldlm_lockname[newmode]);
+ lock->l_req_mode = newmode;
+ }
+
+ if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name,
+ &lock->l_resource->lr_name)) {
+ CDEBUG(D_INFO, "remote intent success, locking "DLDLMRES
+ " instead of "DLDLMRES"\n",
+ PLDLMRES(&reply->lock_desc.l_resource),
+ PLDLMRES(lock->l_resource));
+
+ rc = ldlm_lock_change_resource(ns, lock,
+ &reply->lock_desc.l_resource.lr_name);
+ if (rc || lock->l_resource == NULL) {
+ rc = -ENOMEM;
+ goto cleanup;
+ }
+ LDLM_DEBUG(lock, "client-side enqueue, new resource");
+ }
+ if (with_policy)
+ if (!(type == LDLM_IBITS &&
+ !(exp_connect_flags(exp) & OBD_CONNECT_IBITS)))
+				/* We assume lock type cannot change on server */
+ ldlm_convert_policy_to_local(exp,
+ lock->l_resource->lr_type,
+ &reply->lock_desc.l_policy_data,
+ &lock->l_policy_data);
+ if (type != LDLM_PLAIN)
+ LDLM_DEBUG(lock,
+ "client-side enqueue, new policy data");
+ }
+
+ if ((*flags) & LDLM_FL_AST_SENT ||
+ /* Cancel extent locks as soon as possible on a liblustre client,
+ * because it cannot handle asynchronous ASTs robustly (see
+ * bug 7311). */
+ (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) {
+ lock_res_and_lock(lock);
+ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+ unlock_res_and_lock(lock);
+ LDLM_DEBUG(lock, "enqueue reply includes blocking AST");
+ }
+
+ /* If the lock has already been granted by a completion AST, don't
+ * clobber the LVB with an older one. */
+ if (lvb_len != 0) {
+ /* We must lock or a racing completion might update lvb without
+ * letting us know and we'll clobber the correct value.
+		 * Cannot unlock after the check either, as that still leaves
+		 * a tiny window for the completion to get in. */
+ lock_res_and_lock(lock);
+ if (lock->l_req_mode != lock->l_granted_mode)
+ rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+ lock->l_lvb_data, size);
+ unlock_res_and_lock(lock);
+ if (rc < 0) {
+ cleanup_phase = 1;
+ goto cleanup;
+ }
+ }
+
+ if (!is_replay) {
+ rc = ldlm_lock_enqueue(ns, &lock, NULL, flags);
+ if (lock->l_completion_ast != NULL) {
+ int err = lock->l_completion_ast(lock, *flags, NULL);
+
+ if (!rc)
+ rc = err;
+ if (rc)
+ cleanup_phase = 1;
+ }
+ }
+
+ if (lvb_len && lvb != NULL) {
+ /* Copy the LVB here, and not earlier, because the completion
+ * AST (if any) can override what we got in the reply */
+ memcpy(lvb, lock->l_lvb_data, lvb_len);
+ }
+
+ LDLM_DEBUG(lock, "client-side enqueue END");
+cleanup:
+ if (cleanup_phase == 1 && rc)
+ failed_lock_cleanup(ns, lock, mode);
+ /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
+ LDLM_LOCK_PUT(lock);
+ LDLM_LOCK_RELEASE(lock);
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
+
+/**
+ * Estimate the number of lock handles that would fit into a request of the
+ * given size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into
+ * a single page on the send/receive side. XXX: 512 should be changed to a
+ * more adequate value.
+ */
+static inline int ldlm_req_handles_avail(int req_size, int off)
+{
+ int avail;
+
+ avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size;
+ if (likely(avail >= 0))
+ avail /= (int)sizeof(struct lustre_handle);
+ else
+ avail = 0;
+ avail += LDLM_LOCKREQ_HANDLES - off;
+
+ return avail;
+}
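+
+/*
+ * Illustrative example of the estimate above (numbers are assumptions, not
+ * taken from the headers): with a 4 KiB page, req_size = 1024 and off = 0,
+ * the byte budget is min(LDLM_MAXREQSIZE, 4096 - 512) - 1024; if that yields
+ * 2560 bytes and a struct lustre_handle is 8 bytes, about 320 handles fit,
+ * plus the LDLM_LOCKREQ_HANDLES already present in the fixed request part.
+ */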
+
+static inline int ldlm_capsule_handles_avail(struct req_capsule *pill,
+ enum req_location loc,
+ int off)
+{
+ int size = req_capsule_msg_size(pill, loc);
+
+ return ldlm_req_handles_avail(size, off);
+}
+
+static inline int ldlm_format_handles_avail(struct obd_import *imp,
+ const struct req_format *fmt,
+ enum req_location loc, int off)
+{
+ int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc);
+
+ return ldlm_req_handles_avail(size, off);
+}
+
+/**
+ * Cancel LRU locks and pack them into the enqueue request. Also pack the
+ * given \a count locks from \a cancels into it.
+ *
+ * This is to be called by functions preparing their own requests that
+ * might contain lists of locks to cancel in addition to actual operation
+ * that needs to be performed.
+ */
+int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
+ int version, int opc, int canceloff,
+ struct list_head *cancels, int count)
+{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ struct req_capsule *pill = &req->rq_pill;
+ struct ldlm_request *dlm = NULL;
+ int flags, avail, to_free, pack = 0;
+ LIST_HEAD(head);
+ int rc;
+
+ if (cancels == NULL)
+ cancels = &head;
+ if (ns_connect_cancelset(ns)) {
+ /* Estimate the amount of available space in the request. */
+ req_capsule_filled_sizes(pill, RCL_CLIENT);
+ avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
+
+ flags = ns_connect_lru_resize(ns) ?
+ LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+ to_free = !ns_connect_lru_resize(ns) &&
+ opc == LDLM_ENQUEUE ? 1 : 0;
+
+ /* Cancel LRU locks here _only_ if the server supports
+		 * EARLY_CANCEL. Otherwise we have to send an extra CANCEL
+		 * RPC, which will make us slower. */
+ if (avail > count)
+ count += ldlm_cancel_lru_local(ns, cancels, to_free,
+ avail - count, 0, flags);
+ if (avail > count)
+ pack = count;
+ else
+ pack = avail;
+ req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT,
+ ldlm_request_bufsize(pack, opc));
+ }
+
+ rc = ptlrpc_request_pack(req, version, opc);
+ if (rc) {
+ ldlm_lock_list_put(cancels, l_bl_ast, count);
+ return rc;
+ }
+
+ if (ns_connect_cancelset(ns)) {
+ if (canceloff) {
+ dlm = req_capsule_client_get(pill, &RMF_DLM_REQ);
+ LASSERT(dlm);
+			/* Skip the first lock handle in ldlm_request_pack();
+			 * this method will increment @lock_count according
+			 * to the number of lock handles actually written to
+			 * the buffer. */
+ dlm->lock_count = canceloff;
+ }
+ /* Pack into the request @pack lock handles. */
+ ldlm_cli_cancel_list(cancels, pack, req, 0);
+ /* Prepare and send separate cancel RPC for others. */
+ ldlm_cli_cancel_list(cancels, count - pack, NULL, 0);
+ } else {
+ ldlm_lock_list_put(cancels, l_bl_ast, count);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_prep_elc_req);
+
+int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
+ struct list_head *cancels, int count)
+{
+ return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+ LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
+}
+EXPORT_SYMBOL(ldlm_prep_enqueue_req);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+ ptlrpc_request_set_replen(req);
+ return req;
+}
+EXPORT_SYMBOL(ldlm_enqueue_pack);
+
+/**
+ * Client-side lock enqueue.
+ *
+ * If a request has some specific initialisation it is passed in \a reqp,
+ * otherwise it is created in ldlm_cli_enqueue.
+ *
+ * Supports sync and async requests, pass \a async flag accordingly. If a
+ * request was created in ldlm_cli_enqueue and it is an async request,
+ * pass it to the caller in \a reqp.
+ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+ struct ldlm_enqueue_info *einfo,
+ const struct ldlm_res_id *res_id,
+ ldlm_policy_data_t const *policy, __u64 *flags,
+ void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+ struct lustre_handle *lockh, int async)
+{
+ struct ldlm_namespace *ns;
+ struct ldlm_lock *lock;
+ struct ldlm_request *body;
+ int is_replay = *flags & LDLM_FL_REPLAY;
+ int req_passed_in = 1;
+ int rc, err;
+ struct ptlrpc_request *req;
+
+ LASSERT(exp != NULL);
+
+ ns = exp->exp_obd->obd_namespace;
+
+ /* If we're replaying this lock, just check some invariants.
+	 * If we're creating a new lock, get everything set up nicely. */
+ if (is_replay) {
+ lock = ldlm_handle2lock_long(lockh, 0);
+ LASSERT(lock != NULL);
+ LDLM_DEBUG(lock, "client-side enqueue START");
+ LASSERT(exp == lock->l_conn_export);
+ } else {
+ const struct ldlm_callback_suite cbs = {
+ .lcs_completion = einfo->ei_cb_cp,
+ .lcs_blocking = einfo->ei_cb_bl,
+ .lcs_glimpse = einfo->ei_cb_gl
+ };
+ lock = ldlm_lock_create(ns, res_id, einfo->ei_type,
+ einfo->ei_mode, &cbs, einfo->ei_cbdata,
+ lvb_len, lvb_type);
+ if (lock == NULL)
+ return -ENOMEM;
+ /* for the local lock, add the reference */
+ ldlm_lock_addref_internal(lock, einfo->ei_mode);
+ ldlm_lock2handle(lock, lockh);
+ if (policy != NULL)
+ lock->l_policy_data = *policy;
+
+ if (einfo->ei_type == LDLM_EXTENT)
+ lock->l_req_extent = policy->l_extent;
+ LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n",
+ *flags);
+ }
+
+ lock->l_conn_export = exp;
+ lock->l_export = NULL;
+ lock->l_blocking_ast = einfo->ei_cb_bl;
+ lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL));
+
+ /* lock not sent to server yet */
+
+ if (reqp == NULL || *reqp == NULL) {
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_LDLM_ENQUEUE,
+ LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE);
+ if (req == NULL) {
+ failed_lock_cleanup(ns, lock, einfo->ei_mode);
+ LDLM_LOCK_RELEASE(lock);
+ return -ENOMEM;
+ }
+ req_passed_in = 0;
+ if (reqp)
+ *reqp = req;
+ } else {
+ int len;
+
+ req = *reqp;
+ len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ,
+ RCL_CLIENT);
+ LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n",
+ DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
+ }
+
+ /* Dump lock data into the request buffer */
+ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ ldlm_lock2desc(lock, &body->lock_desc);
+ body->lock_flags = ldlm_flags_to_wire(*flags);
+ body->lock_handle[0] = *lockh;
+
+ /* Continue as normal. */
+ if (!req_passed_in) {
+ if (lvb_len > 0)
+ req_capsule_extend(&req->rq_pill,
+ &RQF_LDLM_ENQUEUE_LVB);
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+ lvb_len);
+ ptlrpc_request_set_replen(req);
+ }
+
+ /*
+ * Liblustre client doesn't get extent locks, except for O_APPEND case
+ * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
+ * [i_size, OBD_OBJECT_EOF] lock is taken.
+ */
+ LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT ||
+ policy->l_extent.end == OBD_OBJECT_EOF));
+
+ if (async) {
+ LASSERT(reqp != NULL);
+ return 0;
+ }
+
+ LDLM_DEBUG(lock, "sending request");
+
+ rc = ptlrpc_queue_wait(req);
+
+ err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
+ einfo->ei_mode, flags, lvb, lvb_len,
+ lockh, rc);
+
+ /* If ldlm_cli_enqueue_fini did not find the lock, we need to free
+ * one reference that we took */
+ if (err == -ENOLCK)
+ LDLM_LOCK_RELEASE(lock);
+ else
+ rc = err;
+
+ if (!req_passed_in && req != NULL) {
+ ptlrpc_req_finished(req);
+ if (reqp)
+ *reqp = NULL;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue);
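+
+/*
+ * A minimal usage sketch (illustrative only; the IBITS type, CR mode and
+ * argument values are assumptions, not taken from a real caller):
+ *
+ *	struct ldlm_enqueue_info einfo = {
+ *		.ei_type  = LDLM_IBITS,
+ *		.ei_mode  = LCK_CR,
+ *		.ei_cb_bl = ldlm_blocking_ast,
+ *		.ei_cb_cp = ldlm_completion_ast,
+ *	};
+ *	__u64 flags = 0;
+ *	struct lustre_handle lockh;
+ *	int rc;
+ *
+ *	rc = ldlm_cli_enqueue(exp, NULL, &einfo, &res_id, &policy, &flags,
+ *			      NULL, 0, LVB_T_NONE, &lockh, 0);
+ *
+ * With async == 0 and reqp == NULL the request is allocated, sent and
+ * finished internally; the caller typically drops the lock later via
+ * ldlm_lock_decref() and ldlm_cli_cancel().
+ */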
+
+static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
+ __u32 *flags)
+{
+ struct ldlm_resource *res;
+ int rc;
+
+ if (ns_is_client(ldlm_lock_to_ns(lock))) {
+ CERROR("Trying to cancel local lock\n");
+ LBUG();
+ }
+ LDLM_DEBUG(lock, "client-side local convert");
+
+ res = ldlm_lock_convert(lock, new_mode, flags);
+ if (res) {
+ ldlm_reprocess_all(res);
+ rc = 0;
+ } else {
+ rc = LUSTRE_EDEADLK;
+ }
+ LDLM_DEBUG(lock, "client-side local convert handler END");
+ LDLM_LOCK_PUT(lock);
+ return rc;
+}
+
+/* FIXME: one of ldlm_cli_convert or the server side should reject attempted
+ * conversion of locks which are on the waiting or converting queue */
+/* Caller of this code is supposed to take care of lock readers/writers
+ accounting */
+int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
+{
+ struct ldlm_request *body;
+ struct ldlm_reply *reply;
+ struct ldlm_lock *lock;
+ struct ldlm_resource *res;
+ struct ptlrpc_request *req;
+ int rc;
+
+ lock = ldlm_handle2lock(lockh);
+ if (!lock) {
+ LBUG();
+ return -EINVAL;
+ }
+ *flags = 0;
+
+ if (lock->l_conn_export == NULL)
+ return ldlm_cli_convert_local(lock, new_mode, flags);
+
+ LDLM_DEBUG(lock, "client-side convert");
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export),
+ &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION,
+ LDLM_CONVERT);
+ if (req == NULL) {
+ LDLM_LOCK_PUT(lock);
+ return -ENOMEM;
+ }
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ body->lock_handle[0] = lock->l_remote_handle;
+
+ body->lock_desc.l_req_mode = new_mode;
+ body->lock_flags = ldlm_flags_to_wire(*flags);
+
+
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc != ELDLM_OK)
+ goto out;
+
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ if (reply == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (req->rq_status) {
+ rc = req->rq_status;
+ goto out;
+ }
+
+ res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags);
+ if (res != NULL) {
+ ldlm_reprocess_all(res);
+ /* Go to sleep until the lock is granted. */
+ /* FIXME: or cancelled. */
+ if (lock->l_completion_ast) {
+ rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC,
+ NULL);
+ if (rc)
+ goto out;
+ }
+ } else {
+ rc = LUSTRE_EDEADLK;
+ }
+ out:
+ LDLM_LOCK_PUT(lock);
+ ptlrpc_req_finished(req);
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_convert);
+
+/**
+ * Cancel locks locally.
+ * Returns:
+ * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server
+ * \retval LDLM_FL_CANCELING otherwise;
+ * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC.
+ */
+static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
+{
+ __u64 rc = LDLM_FL_LOCAL_ONLY;
+
+ if (lock->l_conn_export) {
+ bool local_only;
+
+ LDLM_DEBUG(lock, "client-side cancel");
+		/* Set this flag to prevent others from getting new references */
+ lock_res_and_lock(lock);
+ lock->l_flags |= LDLM_FL_CBPENDING;
+ local_only = !!(lock->l_flags &
+ (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK));
+ ldlm_cancel_callback(lock);
+ rc = (lock->l_flags & LDLM_FL_BL_AST) ?
+ LDLM_FL_BL_AST : LDLM_FL_CANCELING;
+ unlock_res_and_lock(lock);
+
+ if (local_only) {
+ CDEBUG(D_DLMTRACE, "not sending request (at caller's instruction)\n");
+ rc = LDLM_FL_LOCAL_ONLY;
+ }
+ ldlm_lock_cancel(lock);
+ } else {
+ if (ns_is_client(ldlm_lock_to_ns(lock))) {
+ LDLM_ERROR(lock, "Trying to cancel local lock");
+ LBUG();
+ }
+ LDLM_DEBUG(lock, "server-side local cancel");
+ ldlm_lock_cancel(lock);
+ ldlm_reprocess_all(lock->l_resource);
+ }
+
+ return rc;
+}
+
+/**
+ * Pack \a count locks in \a head into ldlm_request buffer of request \a req.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+ struct list_head *head, int count)
+{
+ struct ldlm_request *dlm;
+ struct ldlm_lock *lock;
+ int max, packed = 0;
+
+ dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ LASSERT(dlm != NULL);
+
+ /* Check the room in the request buffer. */
+ max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+ sizeof(struct ldlm_request);
+ max /= sizeof(struct lustre_handle);
+ max += LDLM_LOCKREQ_HANDLES;
+ LASSERT(max >= dlm->lock_count + count);
+
+	/* XXX: it would be better to pack lock handles grouped by resource,
+	 * so that the server cancel would call filter_lvbo_update() less
+ * frequently. */
+ list_for_each_entry(lock, head, l_bl_ast) {
+ if (!count--)
+ break;
+ LASSERT(lock->l_conn_export);
+ /* Pack the lock handle to the given request buffer. */
+ LDLM_DEBUG(lock, "packing");
+ dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle;
+ packed++;
+ }
+ CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+}
+
+/**
+ * Prepare and send a batched cancel RPC. It will include \a count lock
+ * handles of the locks given in the \a cancels list.
+ */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+ int count, ldlm_cancel_flags_t flags)
+{
+ struct ptlrpc_request *req = NULL;
+ struct obd_import *imp;
+ int free, sent = 0;
+ int rc = 0;
+
+ LASSERT(exp != NULL);
+ LASSERT(count > 0);
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+ if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+ return count;
+
+ free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+ &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+ if (count > free)
+ count = free;
+
+ while (1) {
+ imp = class_exp2cliimp(exp);
+ if (imp == NULL || imp->imp_invalid) {
+ CDEBUG(D_DLMTRACE,
+ "skipping cancel on invalid import %p\n", imp);
+ return count;
+ }
+
+ req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+ ldlm_request_bufsize(count, LDLM_CANCEL));
+
+ rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+ if (rc) {
+ ptlrpc_request_free(req);
+ goto out;
+ }
+
+ req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+ req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ ldlm_cancel_pack(req, cancels, count);
+
+ ptlrpc_request_set_replen(req);
+ if (flags & LCF_ASYNC) {
+ ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+ sent = count;
+ goto out;
+ } else {
+ rc = ptlrpc_queue_wait(req);
+ }
+ if (rc == LUSTRE_ESTALE) {
+ CDEBUG(D_DLMTRACE, "client/server (nid %s) out of sync -- not fatal\n",
+ libcfs_nid2str(req->rq_import->
+ imp_connection->c_peer.nid));
+ rc = 0;
+ } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/
+ req->rq_import_generation == imp->imp_generation) {
+ ptlrpc_req_finished(req);
+ continue;
+ } else if (rc != ELDLM_OK) {
+ /* -ESHUTDOWN is common on umount */
+ CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+ "Got rc %d from cancel RPC: canceling anyway\n",
+ rc);
+ break;
+ }
+ sent = count;
+ break;
+ }
+
+ ptlrpc_req_finished(req);
+out:
+ return sent ? sent : rc;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_req);
+
+static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
+{
+ LASSERT(imp != NULL);
+ return &imp->imp_obd->obd_namespace->ns_pool;
+}
+
+/**
+ * Update client's OBD pool related fields with new SLV and Limit from \a req.
+ */
+int ldlm_cli_update_pool(struct ptlrpc_request *req)
+{
+ struct obd_device *obd;
+ __u64 new_slv;
+ __u32 new_limit;
+
+ if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
+ !imp_connect_lru_resize(req->rq_import))) {
+ /*
+ * Do nothing for corner cases.
+ */
+ return 0;
+ }
+
+	/* In some cases the RPC may contain the SLV and limit zeroed out. This
+	 * is the case when the server does not support the LRU resize feature.
+ * This is also possible in some recovery cases when server-side
+ * reqs have no reference to the OBD export and thus access to
+ * server-side namespace is not possible. */
+ if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
+ lustre_msg_get_limit(req->rq_repmsg) == 0) {
+ DEBUG_REQ(D_HA, req,
+ "Zero SLV or Limit found (SLV: %llu, Limit: %u)",
+ lustre_msg_get_slv(req->rq_repmsg),
+ lustre_msg_get_limit(req->rq_repmsg));
+ return 0;
+ }
+
+ new_limit = lustre_msg_get_limit(req->rq_repmsg);
+ new_slv = lustre_msg_get_slv(req->rq_repmsg);
+ obd = req->rq_import->imp_obd;
+
+ /* Set new SLV and limit in OBD fields to make them accessible
+ * to the pool thread. We do not access obd_namespace and pool
+ * directly here as there is no reliable way to make sure that
+ * they are still alive at cleanup time. Evil races are possible
+ * which may cause Oops at that time. */
+ write_lock(&obd->obd_pool_lock);
+ obd->obd_pool_slv = new_slv;
+ obd->obd_pool_limit = new_limit;
+ write_unlock(&obd->obd_pool_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_cli_update_pool);
+
+/**
+ * Client side lock cancel.
+ *
+ * Lock must not have any readers or writers by this time.
+ */
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+ ldlm_cancel_flags_t cancel_flags)
+{
+ struct obd_export *exp;
+ int avail, flags, count = 1;
+ __u64 rc = 0;
+ struct ldlm_namespace *ns;
+ struct ldlm_lock *lock;
+ LIST_HEAD(cancels);
+
+ /* concurrent cancels on the same handle can happen */
+ lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING);
+ if (lock == NULL) {
+ LDLM_DEBUG_NOLOCK("lock is already being destroyed\n");
+ return 0;
+ }
+
+ rc = ldlm_cli_cancel_local(lock);
+ if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) {
+ LDLM_LOCK_RELEASE(lock);
+ return 0;
+ }
+	/* Even if the lock is marked as LDLM_FL_BL_AST, this is an LDLM_CANCEL
+	 * RPC which goes to the canceld portal, so we can cancel other LRU
+	 * locks here and send them all as one LDLM_CANCEL RPC. */
+ LASSERT(list_empty(&lock->l_bl_ast));
+ list_add(&lock->l_bl_ast, &cancels);
+
+ exp = lock->l_conn_export;
+ if (exp_connect_cancelset(exp)) {
+ avail = ldlm_format_handles_avail(class_exp2cliimp(exp),
+ &RQF_LDLM_CANCEL,
+ RCL_CLIENT, 0);
+ LASSERT(avail > 0);
+
+ ns = ldlm_lock_to_ns(lock);
+ flags = ns_connect_lru_resize(ns) ?
+ LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+ count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
+ LCF_BL_AST, flags);
+ }
+ ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags);
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel);
+
+/**
+ * Locally cancel up to \a count locks in list \a cancels.
+ * Return the number of cancelled locks.
+ */
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+ ldlm_cancel_flags_t flags)
+{
+ LIST_HEAD(head);
+ struct ldlm_lock *lock, *next;
+ int left = 0, bl_ast = 0;
+ __u64 rc;
+
+ left = count;
+ list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
+ if (left-- == 0)
+ break;
+
+ if (flags & LCF_LOCAL) {
+ rc = LDLM_FL_LOCAL_ONLY;
+ ldlm_lock_cancel(lock);
+ } else {
+ rc = ldlm_cli_cancel_local(lock);
+ }
+ /* Until we have compound requests and can send LDLM_CANCEL
+ * requests batched with generic RPCs, we need to send cancels
+ * with the LDLM_FL_BL_AST flag in a separate RPC from
+ * the one being generated now. */
+ if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
+ LDLM_DEBUG(lock, "Cancel lock separately");
+ list_del_init(&lock->l_bl_ast);
+ list_add(&lock->l_bl_ast, &head);
+ bl_ast++;
+ continue;
+ }
+ if (rc == LDLM_FL_LOCAL_ONLY) {
+ /* CANCEL RPC should not be sent to server. */
+ list_del_init(&lock->l_bl_ast);
+ LDLM_LOCK_RELEASE(lock);
+ count--;
+ }
+ }
+ if (bl_ast > 0) {
+ count -= bl_ast;
+ ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
+ }
+
+ return count;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list_local);
+
+/**
+ * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g.
+ * readahead requests, ...)
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+ struct ldlm_lock *lock,
+ int unused, int added,
+ int count)
+{
+ ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+ ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+
+ lock_res_and_lock(lock);
+
+ /* don't check added & count since we want to process all locks
+ * from unused list */
+ switch (lock->l_resource->lr_type) {
+ case LDLM_EXTENT:
+ case LDLM_IBITS:
+ if (cb && cb(lock))
+ break;
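+ /* fall through: no cancel-for-recovery callback, or it declined this lock */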
+ default:
+ result = LDLM_POLICY_SKIP_LOCK;
+ lock->l_flags |= LDLM_FL_SKIPPED;
+ break;
+ }
+
+ unlock_res_and_lock(lock);
+ return result;
+}
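+
+/*
+ * Illustrative sketch only (not part of the original patch): the
+ * ns_cancel_for_recovery callback consumed above is assumed to take the
+ * lock and return nonzero when the lock may be canceled without waiting
+ * during recovery.  The name and the exact policy below are hypothetical.
+ */
+#if 0	/* example only */
+static int example_cancel_for_recovery(struct ldlm_lock *lock)
+{
+	/* e.g. treat read-mode locks as cheap enough to drop here */
+	return lock->l_granted_mode == LCK_PR ||
+	       lock->l_granted_mode == LCK_CR;
+}
+#endif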
+
+/**
+ * Callback function for the LRU-resize policy. Decides whether to keep
+ * \a lock in the LRU for the current LRU size \a unused, the number of
+ * locks \a added in the current scan, and the number of locks to be
+ * preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
+ struct ldlm_lock *lock,
+ int unused, int added,
+ int count)
+{
+ unsigned long cur = cfs_time_current();
+ struct ldlm_pool *pl = &ns->ns_pool;
+ __u64 slv, lvf, lv;
+ unsigned long la;
+
+ /* Stop LRU processing when we reach past @count or have checked all
+ * locks in LRU. */
+ if (count && added >= count)
+ return LDLM_POLICY_KEEP_LOCK;
+
+ slv = ldlm_pool_get_slv(pl);
+ lvf = ldlm_pool_get_lvf(pl);
+ la = cfs_duration_sec(cfs_time_sub(cur,
+ lock->l_last_used));
+ lv = lvf * la * unused;
+
+ /* Inform pool about current CLV to see it via proc. */
+ ldlm_pool_set_clv(pl, lv);
+
+ /* Stop when the SLV has not yet come from the server, or lv is
+ * smaller than it. */
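+ /* Worked example (illustrative numbers only): with lvf = 100, a lock
+ * idle for la = 30 seconds and unused = 1000 locks in the LRU gives
+ * lv = 100 * 30 * 1000 = 3000000; the lock is canceled only if the
+ * server-provided SLV is nonzero and does not exceed 3000000,
+ * otherwise it is kept and scanning stops. */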
+ return (slv == 0 || lv < slv) ?
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for the policy used via proc (the "passed" policy).
+ * Decides whether to keep \a lock in the LRU for the current LRU size
+ * \a unused, the number of locks \a added in the current scan, and the
+ * number of locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
+ struct ldlm_lock *lock,
+ int unused, int added,
+ int count)
+{
+ /* Stop LRU processing when we reach past @count or have checked all
+ * locks in LRU. */
+ return (added >= count) ?
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for the aged policy. Decides whether to keep \a lock
+ * in the LRU for the current LRU size \a unused, the number of locks
+ * \a added in the current scan, and the number of locks to be preferably
+ * canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
+ struct ldlm_lock *lock,
+ int unused, int added,
+ int count)
+{
+ /* Stop LRU processing if young lock is found and we reach past count */
+ return ((added >= count) &&
+ time_before(cfs_time_current(),
+ cfs_time_add(lock->l_last_used, ns->ns_max_age))) ?
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for the default policy. Decides whether to keep \a lock
+ * in the LRU for the current LRU size \a unused, the number of locks
+ * \a added in the current scan, and the number of locks to be preferably
+ * canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns,
+ struct ldlm_lock *lock,
+ int unused, int added,
+ int count)
+{
+ /* Stop LRU processing when we reach past count or have checked all
+ * locks in LRU. */
+ return (added >= count) ?
+ LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
+ struct ldlm_lock *, int,
+ int, int);
+
+static ldlm_cancel_lru_policy_t
+ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
+{
+ if (flags & LDLM_CANCEL_NO_WAIT)
+ return ldlm_cancel_no_wait_policy;
+
+ if (ns_connect_lru_resize(ns)) {
+ if (flags & LDLM_CANCEL_SHRINK)
+ /* We kill passed number of old locks. */
+ return ldlm_cancel_passed_policy;
+ else if (flags & LDLM_CANCEL_LRUR)
+ return ldlm_cancel_lrur_policy;
+ else if (flags & LDLM_CANCEL_PASSED)
+ return ldlm_cancel_passed_policy;
+ } else {
+ if (flags & LDLM_CANCEL_AGED)
+ return ldlm_cancel_aged_policy;
+ }
+
+ return ldlm_cancel_default_policy;
+}
+
+/**
+ * - Free space in LRU for \a count new locks,
+ * redundant unused locks are canceled locally;
+ * - also cancel locally unused aged locks;
+ * - do not cancel more than \a max locks;
+ * - GET the found locks and add them into the \a cancels list.
+ *
+ * A client lock can be added to the l_bl_ast list only when it is
+ * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing
+ * CANCEL. There are the following use cases:
+ * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and
+ * ldlm_cli_cancel(), which check and set this flag properly. As any
+ * attempt to cancel a lock relies on this flag, the l_bl_ast list is
+ * accessed later without any special locking.
+ *
+ * Calling policies for enabled LRU resize:
+ * ----------------------------------------
+ * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to
+ * cancel not more than \a count locks;
+ *
+ * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at
+ * the beginning of LRU list);
+ *
+ * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to
+ * memory pressure policy function;
+ *
+ * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ * (typically before replaying locks) w/o
+ * sending any RPCs or waiting for any
+ * outstanding RPC to complete.
+ */
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
+ struct list_head *cancels, int count, int max,
+ int flags)
+{
+ ldlm_cancel_lru_policy_t pf;
+ struct ldlm_lock *lock, *next;
+ int added = 0, unused, remained;
+
+ spin_lock(&ns->ns_lock);
+ unused = ns->ns_nr_unused;
+ remained = unused;
+
+ if (!ns_connect_lru_resize(ns))
+ count += unused - ns->ns_max_unused;
+
+ pf = ldlm_cancel_lru_policy(ns, flags);
+ LASSERT(pf != NULL);
+
+ while (!list_empty(&ns->ns_unused_list)) {
+ ldlm_policy_res_t result;
+
+ /* all unused locks */
+ if (remained-- <= 0)
+ break;
+
+ /* For any flags, stop scanning if @max is reached. */
+ if (max && added >= max)
+ break;
+
+ list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+ l_lru) {
+ /* No locks which got blocking requests. */
+ LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
+
+ if (flags & LDLM_CANCEL_NO_WAIT &&
+ lock->l_flags & LDLM_FL_SKIPPED)
+ /* already processed */
+ continue;
+
+ /* Somebody is already doing CANCEL. No need for this
+ * lock in LRU, do not traverse it again. */
+ if (!(lock->l_flags & LDLM_FL_CANCELING))
+ break;
+
+ ldlm_lock_remove_from_lru_nolock(lock);
+ }
+ if (&lock->l_lru == &ns->ns_unused_list)
+ break;
+
+ LDLM_LOCK_GET(lock);
+ spin_unlock(&ns->ns_lock);
+ lu_ref_add(&lock->l_reference, __func__, current);
+
+ /* Pass the lock through the policy filter and see if it
+ * should stay in LRU.
+ *
+ * Even for shrinker policy we stop scanning if
+ * we find a lock that should stay in the cache.
+ * We should take into account lock age anyway
+ * as a new lock is a valuable resource even if
+ * it has a low weight.
+ *
+ * That is, for shrinker policy we drop only
+ * old locks, but additionally choose them by
+ * their weight. Big extent locks will stay in
+ * the cache. */
+ result = pf(ns, lock, unused, added, count);
+ if (result == LDLM_POLICY_KEEP_LOCK) {
+ lu_ref_del(&lock->l_reference,
+ __func__, current);
+ LDLM_LOCK_RELEASE(lock);
+ spin_lock(&ns->ns_lock);
+ break;
+ }
+ if (result == LDLM_POLICY_SKIP_LOCK) {
+ lu_ref_del(&lock->l_reference,
+ __func__, current);
+ LDLM_LOCK_RELEASE(lock);
+ spin_lock(&ns->ns_lock);
+ continue;
+ }
+
+ lock_res_and_lock(lock);
+ /* Check flags again under the lock. */
+ if ((lock->l_flags & LDLM_FL_CANCELING) ||
+ (ldlm_lock_remove_from_lru(lock) == 0)) {
+ /* Another thread is removing lock from LRU, or
+ * somebody is already doing CANCEL, or there
+ * is a blocking request which will send cancel
+ * by itself, or the lock is no longer unused. */
+ unlock_res_and_lock(lock);
+ lu_ref_del(&lock->l_reference,
+ __func__, current);
+ LDLM_LOCK_RELEASE(lock);
+ spin_lock(&ns->ns_lock);
+ continue;
+ }
+ LASSERT(!lock->l_readers && !lock->l_writers);
+
+ /* If we have chosen to cancel this lock voluntarily, we had
+ * better send the cancel notification to the server, so that it
+ * frees the appropriate state. This might lead to a race where,
+ * while we are doing the cancel here, the server is also
+ * silently cancelling this lock. */
+ lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
+ /* Setting the CBPENDING flag is a little misleading,
+ * but prevents an important race; namely, once
+ * CBPENDING is set, the lock can accumulate no more
+ * readers/writers. Since readers and writers are
+ * already zero here, ldlm_lock_decref() won't see
+ * this flag and call l_blocking_ast */
+ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
+
+ /* We can't re-add to l_lru as it confuses the
+ * refcounting in ldlm_lock_remove_from_lru() if an AST
+ * arrives after we drop lr_lock below. We use l_bl_ast
+ * and can't use l_pending_chain as it is used both on
+ * the server and the client; nevertheless, bug 5666 says it is
+ * used only on the server. */
+ LASSERT(list_empty(&lock->l_bl_ast));
+ list_add(&lock->l_bl_ast, cancels);
+ unlock_res_and_lock(lock);
+ lu_ref_del(&lock->l_reference, __func__, current);
+ spin_lock(&ns->ns_lock);
+ added++;
+ unused--;
+ }
+ spin_unlock(&ns->ns_lock);
+ return added;
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
+ int count, int max, ldlm_cancel_flags_t cancel_flags,
+ int flags)
+{
+ int added;
+
+ added = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+ if (added <= 0)
+ return added;
+ return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
+}
+
+/**
+ * Cancel at least \a nr locks from given namespace LRU.
+ *
+ * When called with LCF_ASYNC the blocking callback will be handled
+ * in a thread and this function will return after the thread has been
+ * asked to call the callback. When called without LCF_ASYNC the blocking
+ * callback will be performed in this function.
+ */
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+ ldlm_cancel_flags_t cancel_flags,
+ int flags)
+{
+ LIST_HEAD(cancels);
+ int count, rc;
+
+ /* Just prepare the list of locks, do not actually cancel them yet.
+ * Locks are cancelled later in a separate thread. */
+ count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+ rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags);
+ if (rc == 0)
+ return count;
+
+ return 0;
+}
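+
+/*
+ * Minimal usage sketch (not part of the original patch), mirroring how the
+ * lru_size proc handler later in this patch drives LRU cancellation; the
+ * wrapper name is hypothetical.
+ */
+#if 0	/* example only */
+static void example_shrink_lru(struct ldlm_namespace *ns, int nr)
+{
+	/* Queue up to nr unused locks for cancellation; with LCF_ASYNC the
+	 * blocking callbacks are run by the ldlm blocking threads. */
+	ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_CANCEL_PASSED);
+}
+#endif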
+
+/**
+ * Find and cancel locally unused locks found on resource, matched to the
+ * given policy, mode. GET the found locks and add them into the \a cancels
+ * list.
+ */
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+ struct list_head *cancels,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode, __u64 lock_flags,
+ ldlm_cancel_flags_t cancel_flags, void *opaque)
+{
+ struct ldlm_lock *lock;
+ int count = 0;
+
+ lock_res(res);
+ list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+ if (opaque != NULL && lock->l_ast_data != opaque) {
+ LDLM_ERROR(lock, "data %p doesn't match opaque %p",
+ lock->l_ast_data, opaque);
+ continue;
+ }
+
+ if (lock->l_readers || lock->l_writers)
+ continue;
+
+ /* If somebody is already doing CANCEL, or blocking AST came,
+ * skip this lock. */
+ if (lock->l_flags & LDLM_FL_BL_AST ||
+ lock->l_flags & LDLM_FL_CANCELING)
+ continue;
+
+ if (lockmode_compat(lock->l_granted_mode, mode))
+ continue;
+
+ /* If policy is given and this is IBITS lock, add to list only
+ * those locks that match by policy. */
+ if (policy && (lock->l_resource->lr_type == LDLM_IBITS) &&
+ !(lock->l_policy_data.l_inodebits.bits &
+ policy->l_inodebits.bits))
+ continue;
+
+ /* See CBPENDING comment in ldlm_cancel_lru */
+ lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
+ lock_flags;
+
+ LASSERT(list_empty(&lock->l_bl_ast));
+ list_add(&lock->l_bl_ast, cancels);
+ LDLM_LOCK_GET(lock);
+ count++;
+ }
+ unlock_res(res);
+
+ return ldlm_cli_cancel_list_local(cancels, count, cancel_flags);
+}
+EXPORT_SYMBOL(ldlm_cancel_resource_local);
+
+/**
+ * Cancel client-side locks from a list and send/prepare cancel RPCs to the
+ * server.
+ * If \a req is NULL, send CANCEL request to server with handles of locks
+ * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests
+ * separately per lock.
+ * If \a req is not NULL, put handles of locks in \a cancels into the request
+ * buffer at the offset \a off.
+ * Destroy \a cancels at the end.
+ */
+int ldlm_cli_cancel_list(struct list_head *cancels, int count,
+ struct ptlrpc_request *req, ldlm_cancel_flags_t flags)
+{
+ struct ldlm_lock *lock;
+ int res = 0;
+
+ if (list_empty(cancels) || count == 0)
+ return 0;
+
+ /* XXX: requests (both batched and not) could be sent in parallel.
+ * Usually it is enough to have just 1 RPC, but it is possible that
+ * there are too many locks to be cancelled in LRU or on a resource.
+ * It would also speed up the case when the server does not support
+ * the feature. */
+ while (count > 0) {
+ LASSERT(!list_empty(cancels));
+ lock = list_entry(cancels->next, struct ldlm_lock,
+ l_bl_ast);
+ LASSERT(lock->l_conn_export);
+
+ if (exp_connect_cancelset(lock->l_conn_export)) {
+ res = count;
+ if (req)
+ ldlm_cancel_pack(req, cancels, count);
+ else
+ res = ldlm_cli_cancel_req(lock->l_conn_export,
+ cancels, count,
+ flags);
+ } else {
+ res = ldlm_cli_cancel_req(lock->l_conn_export,
+ cancels, 1, flags);
+ }
+
+ if (res < 0) {
+ CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+ "ldlm_cli_cancel_list: %d\n", res);
+ res = count;
+ }
+
+ count -= res;
+ ldlm_lock_list_put(cancels, l_bl_ast, res);
+ }
+ LASSERT(count == 0);
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list);
+
+/**
+ * Cancel all locks on a resource that have 0 readers/writers.
+ *
+ * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
+ * to notify the server. */
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags,
+ void *opaque)
+{
+ struct ldlm_resource *res;
+ LIST_HEAD(cancels);
+ int count;
+ int rc;
+
+ res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+ if (res == NULL) {
+ /* This is not a problem. */
+ CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]);
+ return 0;
+ }
+
+ LDLM_RESOURCE_ADDREF(res);
+ count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
+ 0, flags | LCF_BL_AST, opaque);
+ rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
+ if (rc != ELDLM_OK)
+ CERROR("canceling unused lock "DLDLMRES": rc = %d\n",
+ PLDLMRES(res), rc);
+
+ LDLM_RESOURCE_DELREF(res);
+ ldlm_resource_putref(res);
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
+
+struct ldlm_cli_cancel_arg {
+ int lc_flags;
+ void *lc_opaque;
+};
+
+static int ldlm_cli_hash_cancel_unused(struct cfs_hash *hs,
+ struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+ struct ldlm_cli_cancel_arg *lc = arg;
+
+ ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
+ NULL, LCK_MINMODE,
+ lc->lc_flags, lc->lc_opaque);
+ /* must return 0 for hash iteration */
+ return 0;
+}
+
+/**
+ * Cancel all locks on a namespace (or a specific resource, if given)
+ * that have 0 readers/writers.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying
+ * to notify the server. */
+int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_cancel_flags_t flags, void *opaque)
+{
+ struct ldlm_cli_cancel_arg arg = {
+ .lc_flags = flags,
+ .lc_opaque = opaque,
+ };
+
+ if (ns == NULL)
+ return ELDLM_OK;
+
+ if (res_id != NULL) {
+ return ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
+ LCK_MINMODE, flags,
+ opaque);
+ } else {
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ ldlm_cli_hash_cancel_unused, &arg);
+ return ELDLM_OK;
+ }
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused);
+
+/* Lock iterators. */
+
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+ void *closure)
+{
+ struct list_head *tmp, *next;
+ struct ldlm_lock *lock;
+ int rc = LDLM_ITER_CONTINUE;
+
+ if (!res)
+ return LDLM_ITER_CONTINUE;
+
+ lock_res(res);
+ list_for_each_safe(tmp, next, &res->lr_granted) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (iter(lock, closure) == LDLM_ITER_STOP) {
+ rc = LDLM_ITER_STOP;
+ goto out;
+ }
+ }
+
+ list_for_each_safe(tmp, next, &res->lr_converting) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (iter(lock, closure) == LDLM_ITER_STOP) {
+ rc = LDLM_ITER_STOP;
+ goto out;
+ }
+ }
+
+ list_for_each_safe(tmp, next, &res->lr_waiting) {
+ lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+ if (iter(lock, closure) == LDLM_ITER_STOP) {
+ rc = LDLM_ITER_STOP;
+ goto out;
+ }
+ }
+ out:
+ unlock_res(res);
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_resource_foreach);
+
+struct iter_helper_data {
+ ldlm_iterator_t iter;
+ void *closure;
+};
+
+static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
+{
+ struct iter_helper_data *helper = closure;
+
+ return helper->iter(lock, helper->closure);
+}
+
+static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+ return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+ LDLM_ITER_STOP;
+}
+
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+ ldlm_iterator_t iter, void *closure)
+
+{
+ struct iter_helper_data helper = {
+ .iter = iter,
+ .closure = closure,
+ };
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ ldlm_res_iter_helper, &helper);
+
+}
+EXPORT_SYMBOL(ldlm_namespace_foreach);
+
+/* Non-blocking function to manipulate a lock whose cb_data is being put away.
+ * Returns:
+ *  0: no resource found
+ *  > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE
+ *  < 0: error
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+ const struct ldlm_res_id *res_id,
+ ldlm_iterator_t iter, void *data)
+{
+ struct ldlm_resource *res;
+ int rc;
+
+ if (ns == NULL) {
+ CERROR("must pass in namespace\n");
+ LBUG();
+ }
+
+ res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+ if (res == NULL)
+ return 0;
+
+ LDLM_RESOURCE_ADDREF(res);
+ rc = ldlm_resource_foreach(res, iter, data);
+ LDLM_RESOURCE_DELREF(res);
+ ldlm_resource_putref(res);
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
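+
+/*
+ * Illustrative iterator callback (not part of the original patch): an
+ * ldlm_iterator_t receives each lock plus the opaque closure and returns
+ * LDLM_ITER_CONTINUE or LDLM_ITER_STOP, as ldlm_chain_lock_for_replay()
+ * below does for real.  The counter example here is hypothetical.
+ */
+#if 0	/* example only */
+static int example_count_locks(struct ldlm_lock *lock, void *closure)
+{
+	int *count = closure;
+
+	(*count)++;
+	return LDLM_ITER_CONTINUE;
+}
+/* usage: int n = 0; ldlm_namespace_foreach(ns, example_count_locks, &n); */
+#endif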
+
+/* Lock replay */
+
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+ struct list_head *list = closure;
+
+ /* we use l_pending_chain here, because it's unused on clients. */
+ LASSERTF(list_empty(&lock->l_pending_chain),
+ "lock %p next %p prev %p\n",
+ lock, &lock->l_pending_chain.next,
+ &lock->l_pending_chain.prev);
+ /* Bug 9573: don't replay locks left after eviction; and
+ * bug 17614: don't replay locks that are being actively cancelled.
+ * Get a reference on the lock so that it does not disappear under
+ * us (e.g. due to cancel).
+ */
+ if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+ list_add(&lock->l_pending_chain, list);
+ LDLM_LOCK_GET(lock);
+ }
+
+ return LDLM_ITER_CONTINUE;
+}
+
+static int replay_lock_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ struct ldlm_async_args *aa, int rc)
+{
+ struct ldlm_lock *lock;
+ struct ldlm_reply *reply;
+ struct obd_export *exp;
+
+ atomic_dec(&req->rq_import->imp_replay_inflight);
+ if (rc != ELDLM_OK)
+ goto out;
+
+
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ if (reply == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ lock = ldlm_handle2lock(&aa->lock_handle);
+ if (!lock) {
+ CERROR("received replay ack for unknown local cookie %#llx remote cookie %#llx from server %s id %s\n",
+ aa->lock_handle.cookie, reply->lock_handle.cookie,
+ req->rq_export->exp_client_uuid.uuid,
+ libcfs_id2str(req->rq_peer));
+ rc = -ESTALE;
+ goto out;
+ }
+
+ /* Key change rehash lock in per-export hash with new key */
+ exp = req->rq_export;
+ if (exp && exp->exp_lock_hash) {
+ /* In the function below, .hs_keycmp resolves to
+ * ldlm_export_lock_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ cfs_hash_rehash_key(exp->exp_lock_hash,
+ &lock->l_remote_handle,
+ &reply->lock_handle,
+ &lock->l_exp_hash);
+ } else {
+ lock->l_remote_handle = reply->lock_handle;
+ }
+
+ LDLM_DEBUG(lock, "replayed lock:");
+ ptlrpc_import_recovery_state_machine(req->rq_import);
+ LDLM_LOCK_PUT(lock);
+out:
+ if (rc != ELDLM_OK)
+ ptlrpc_connect_import(req->rq_import);
+
+ return rc;
+}
+
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
+{
+ struct ptlrpc_request *req;
+ struct ldlm_async_args *aa;
+ struct ldlm_request *body;
+ int flags;
+
+ /* Bug 11974: Do not replay a lock which is actively being canceled */
+ if (lock->l_flags & LDLM_FL_CANCELING) {
+ LDLM_DEBUG(lock, "Not replaying canceled lock:");
+ return 0;
+ }
+
+ /* If this is a reply-less callback lock, we cannot replay it, since
+ * the server might have long since dropped it, and the notification of
+ * that event was lost in the network (the server may already have
+ * granted a conflicting lock). */
+ if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+ LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+ ldlm_lock_cancel(lock);
+ return 0;
+ }
+
+ /*
+ * If granted mode matches the requested mode, this lock is granted.
+ *
+ * If they differ, but we have a granted mode, then we were granted
+ * one mode and now want another: ergo, converting.
+ *
+ * If we haven't been granted anything and are on a resource list,
+ * then we're blocked/waiting.
+ *
+ * If we haven't been granted anything and we're NOT on a resource list,
+ * then we haven't got a reply yet and don't have a known disposition.
+ * This happens whenever a lock enqueue is the request that triggers
+ * recovery.
+ */
+ if (lock->l_granted_mode == lock->l_req_mode)
+ flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+ else if (lock->l_granted_mode)
+ flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
+ else if (!list_empty(&lock->l_res_link))
+ flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+ else
+ flags = LDLM_FL_REPLAY;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
+ LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
+ if (req == NULL)
+ return -ENOMEM;
+
+ /* We're part of recovery, so don't wait for it. */
+ req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ ldlm_lock2desc(lock, &body->lock_desc);
+ body->lock_flags = ldlm_flags_to_wire(flags);
+
+ ldlm_lock2handle(lock, &body->lock_handle[0]);
+ if (lock->l_lvb_len > 0)
+ req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+ lock->l_lvb_len);
+ ptlrpc_request_set_replen(req);
+ /* Notify the server we've replayed all requests.
+ * Also, mark the request to be put on a dedicated
+ * queue to be processed after all request replays.
+ * See bug 6063. */
+ lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+ LDLM_DEBUG(lock, "replaying lock:");
+
+ atomic_inc(&req->rq_import->imp_replay_inflight);
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->lock_handle = body->lock_handle[0];
+ req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
+ ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+ return 0;
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we cannot wait for any outstanding RPCs, nor can we send
+ * any new RPC to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+ int canceled;
+ LIST_HEAD(cancels);
+
+ CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before replay for namespace %s (%d)\n",
+ ldlm_ns_name(ns), ns->ns_nr_unused);
+
+ /* We don't need to care whether or not LRU resize is enabled
+ * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+ * count parameter */
+ canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+ LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+ CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+ canceled, ldlm_ns_name(ns));
+}
+
+int ldlm_replay_locks(struct obd_import *imp)
+{
+ struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+ LIST_HEAD(list);
+ struct ldlm_lock *lock, *next;
+ int rc = 0;
+
+ LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+
+ /* don't replay locks if import failed recovery */
+ if (imp->imp_vbr_failed)
+ return 0;
+
+ /* ensure this doesn't fall to 0 before all have been queued */
+ atomic_inc(&imp->imp_replay_inflight);
+
+ if (ldlm_cancel_unused_locks_before_replay)
+ ldlm_cancel_unused_locks_for_replay(ns);
+
+ ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+
+ list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+ list_del_init(&lock->l_pending_chain);
+ if (rc) {
+ LDLM_LOCK_RELEASE(lock);
+ continue; /* or try to do the rest? */
+ }
+ rc = replay_one_lock(imp, lock);
+ LDLM_LOCK_RELEASE(lock);
+ }
+
+ atomic_dec(&imp->imp_replay_inflight);
+
+ return rc;
+}
+EXPORT_SYMBOL(ldlm_replay_locks);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
new file mode 100644
index 000000000..f750d42a7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
@@ -0,0 +1,1425 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_resource.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_fid.h"
+#include "../include/obd_class.h"
+#include "ldlm_internal.h"
+
+struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab;
+
+int ldlm_srv_namespace_nr = 0;
+int ldlm_cli_namespace_nr = 0;
+
+struct mutex ldlm_srv_namespace_lock;
+LIST_HEAD(ldlm_srv_namespace_list);
+
+struct mutex ldlm_cli_namespace_lock;
+/* Client Namespaces that have active resources in them.
+ * Once all resources go away, ldlm_poold moves such namespaces to the
+ * inactive list */
+LIST_HEAD(ldlm_cli_active_namespace_list);
+/* Client namespaces that don't have any locks in them */
+LIST_HEAD(ldlm_cli_inactive_namespace_list);
+
+struct proc_dir_entry *ldlm_type_proc_dir = NULL;
+static struct proc_dir_entry *ldlm_ns_proc_dir = NULL;
+struct proc_dir_entry *ldlm_svc_proc_dir = NULL;
+
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
+/* During a debug dump, only print a limited number of granted locks for one
+ * resource, to avoid a DDoS. */
+unsigned int ldlm_dump_granted_max = 256;
+
+#if defined(CONFIG_PROC_FS)
+static ssize_t lprocfs_wr_dump_ns(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+ ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+ return count;
+}
+LPROC_SEQ_FOPS_WR_ONLY(ldlm, dump_ns);
+
+LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint);
+LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint);
+
+int ldlm_proc_setup(void)
+{
+ int rc;
+ struct lprocfs_vars list[] = {
+ { "dump_namespaces", &ldlm_dump_ns_fops, NULL, 0222 },
+ { "dump_granted_max", &ldlm_rw_uint_fops,
+ &ldlm_dump_granted_max },
+ { "cancel_unused_locks_before_replay", &ldlm_rw_uint_fops,
+ &ldlm_cancel_unused_locks_before_replay },
+ { NULL } };
+ LASSERT(ldlm_ns_proc_dir == NULL);
+
+ ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME,
+ proc_lustre_root,
+ NULL, NULL);
+ if (IS_ERR(ldlm_type_proc_dir)) {
+ CERROR("LProcFS failed in ldlm-init\n");
+ rc = PTR_ERR(ldlm_type_proc_dir);
+ goto err;
+ }
+
+ ldlm_ns_proc_dir = lprocfs_register("namespaces",
+ ldlm_type_proc_dir,
+ NULL, NULL);
+ if (IS_ERR(ldlm_ns_proc_dir)) {
+ CERROR("LProcFS failed in ldlm-init\n");
+ rc = PTR_ERR(ldlm_ns_proc_dir);
+ goto err_type;
+ }
+
+ ldlm_svc_proc_dir = lprocfs_register("services",
+ ldlm_type_proc_dir,
+ NULL, NULL);
+ if (IS_ERR(ldlm_svc_proc_dir)) {
+ CERROR("LProcFS failed in ldlm-init\n");
+ rc = PTR_ERR(ldlm_svc_proc_dir);
+ goto err_ns;
+ }
+
+ rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL);
+
+ return 0;
+
+err_ns:
+ lprocfs_remove(&ldlm_ns_proc_dir);
+err_type:
+ lprocfs_remove(&ldlm_type_proc_dir);
+err:
+ ldlm_svc_proc_dir = NULL;
+ ldlm_type_proc_dir = NULL;
+ ldlm_ns_proc_dir = NULL;
+ return rc;
+}
+
+void ldlm_proc_cleanup(void)
+{
+ if (ldlm_svc_proc_dir)
+ lprocfs_remove(&ldlm_svc_proc_dir);
+
+ if (ldlm_ns_proc_dir)
+ lprocfs_remove(&ldlm_ns_proc_dir);
+
+ if (ldlm_type_proc_dir)
+ lprocfs_remove(&ldlm_type_proc_dir);
+
+ ldlm_svc_proc_dir = NULL;
+ ldlm_type_proc_dir = NULL;
+ ldlm_ns_proc_dir = NULL;
+}
+
+static int lprocfs_ns_resources_seq_show(struct seq_file *m, void *v)
+{
+ struct ldlm_namespace *ns = m->private;
+ __u64 res = 0;
+ struct cfs_hash_bd bd;
+ int i;
+
+ /* result is not strictly consistent */
+ cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i)
+ res += cfs_hash_bd_count_get(&bd);
+ return lprocfs_rd_u64(m, &res);
+}
+LPROC_SEQ_FOPS_RO(lprocfs_ns_resources);
+
+static int lprocfs_ns_locks_seq_show(struct seq_file *m, void *v)
+{
+ struct ldlm_namespace *ns = m->private;
+ __u64 locks;
+
+ locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS,
+ LPROCFS_FIELDS_FLAGS_SUM);
+ return lprocfs_rd_u64(m, &locks);
+}
+LPROC_SEQ_FOPS_RO(lprocfs_ns_locks);
+
+static int lprocfs_lru_size_seq_show(struct seq_file *m, void *v)
+{
+ struct ldlm_namespace *ns = m->private;
+ __u32 *nr = &ns->ns_max_unused;
+
+ if (ns_connect_lru_resize(ns))
+ nr = &ns->ns_nr_unused;
+ return lprocfs_rd_uint(m, nr);
+}
+
+static ssize_t lprocfs_lru_size_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private;
+ char dummy[MAX_STRING_SIZE + 1];
+ unsigned long tmp;
+ int lru_resize;
+ int err;
+
+ dummy[MAX_STRING_SIZE] = '\0';
+ if (copy_from_user(dummy, buffer, MAX_STRING_SIZE))
+ return -EFAULT;
+
+ if (strncmp(dummy, "clear", 5) == 0) {
+ CDEBUG(D_DLMTRACE,
+ "dropping all unused locks from namespace %s\n",
+ ldlm_ns_name(ns));
+ if (ns_connect_lru_resize(ns)) {
+ int canceled, unused = ns->ns_nr_unused;
+
+ /* Try to cancel all @ns_nr_unused locks. */
+ canceled = ldlm_cancel_lru(ns, unused, 0,
+ LDLM_CANCEL_PASSED);
+ if (canceled < unused) {
+ CDEBUG(D_DLMTRACE,
+ "not all requested locks are canceled, requested: %d, canceled: %d\n",
+ unused,
+ canceled);
+ return -EINVAL;
+ }
+ } else {
+ tmp = ns->ns_max_unused;
+ ns->ns_max_unused = 0;
+ ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED);
+ ns->ns_max_unused = tmp;
+ }
+ return count;
+ }
+
+ err = kstrtoul(dummy, 10, &tmp);
+ if (err != 0) {
+ CERROR("invalid value written\n");
+ return -EINVAL;
+ }
+ lru_resize = (tmp == 0);
+
+ if (ns_connect_lru_resize(ns)) {
+ if (!lru_resize)
+ ns->ns_max_unused = (unsigned int)tmp;
+
+ if (tmp > ns->ns_nr_unused)
+ tmp = ns->ns_nr_unused;
+ tmp = ns->ns_nr_unused - tmp;
+
+ CDEBUG(D_DLMTRACE,
+ "changing namespace %s unused locks from %u to %u\n",
+ ldlm_ns_name(ns), ns->ns_nr_unused,
+ (unsigned int)tmp);
+ ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+ if (!lru_resize) {
+ CDEBUG(D_DLMTRACE,
+ "disable lru_resize for namespace %s\n",
+ ldlm_ns_name(ns));
+ ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE;
+ }
+ } else {
+ CDEBUG(D_DLMTRACE,
+ "changing namespace %s max_unused from %u to %u\n",
+ ldlm_ns_name(ns), ns->ns_max_unused,
+ (unsigned int)tmp);
+ ns->ns_max_unused = (unsigned int)tmp;
+ ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+ /* Make sure that LRU resize was originally supported before
+ * turning it on here. */
+ if (lru_resize &&
+ (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) {
+ CDEBUG(D_DLMTRACE,
+ "enable lru_resize for namespace %s\n",
+ ldlm_ns_name(ns));
+ ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+ }
+ }
+
+ return count;
+}
+LPROC_SEQ_FOPS(lprocfs_lru_size);
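+
+/*
+ * Usage note (not part of the original patch): for client namespaces this
+ * handler is exposed as lru_size under the per-namespace proc directory
+ * (typically /proc/fs/lustre/ldlm/namespaces/<namespace>/lru_size).
+ * Roughly: writing "clear" drops all unused locks, writing 0 re-enables
+ * LRU resize if the server originally supported it, and writing a positive
+ * number sets a fixed LRU size and disables LRU resize.
+ */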
+
+static int lprocfs_elc_seq_show(struct seq_file *m, void *v)
+{
+ struct ldlm_namespace *ns = m->private;
+ unsigned int supp = ns_connect_cancelset(ns);
+
+ return lprocfs_rd_uint(m, &supp);
+}
+
+static ssize_t lprocfs_elc_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private;
+ unsigned int supp = -1;
+ int rc;
+
+ rc = lprocfs_wr_uint(file, buffer, count, &supp);
+ if (rc < 0)
+ return rc;
+
+ if (supp == 0)
+ ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET;
+ else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET)
+ ns->ns_connect_flags |= OBD_CONNECT_CANCELSET;
+ return count;
+}
+LPROC_SEQ_FOPS(lprocfs_elc);
+
+void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns)
+{
+ if (ns->ns_proc_dir_entry == NULL)
+ CERROR("dlm namespace %s has no procfs dir?\n",
+ ldlm_ns_name(ns));
+ else
+ lprocfs_remove(&ns->ns_proc_dir_entry);
+
+ if (ns->ns_stats != NULL)
+ lprocfs_free_stats(&ns->ns_stats);
+}
+
+#define LDLM_NS_ADD_VAR(name, var, ops) \
+ do { \
+ snprintf(lock_name, MAX_STRING_SIZE, name); \
+ lock_vars[0].data = var; \
+ lock_vars[0].fops = ops; \
+ lprocfs_add_vars(ns_pde, lock_vars, NULL); \
+ } while (0)
+
+int ldlm_namespace_proc_register(struct ldlm_namespace *ns)
+{
+ struct lprocfs_vars lock_vars[2];
+ char lock_name[MAX_STRING_SIZE + 1];
+ struct proc_dir_entry *ns_pde;
+
+ LASSERT(ns != NULL);
+ LASSERT(ns->ns_rs_hash != NULL);
+
+ if (ns->ns_proc_dir_entry != NULL) {
+ ns_pde = ns->ns_proc_dir_entry;
+ } else {
+ ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir);
+ if (ns_pde == NULL)
+ return -ENOMEM;
+ ns->ns_proc_dir_entry = ns_pde;
+ }
+
+ ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0);
+ if (ns->ns_stats == NULL)
+ return -ENOMEM;
+
+ lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS,
+ LPROCFS_CNTR_AVGMINMAX, "locks", "locks");
+
+ lock_name[MAX_STRING_SIZE] = '\0';
+
+ memset(lock_vars, 0, sizeof(lock_vars));
+ lock_vars[0].name = lock_name;
+
+ LDLM_NS_ADD_VAR("resource_count", ns, &lprocfs_ns_resources_fops);
+ LDLM_NS_ADD_VAR("lock_count", ns, &lprocfs_ns_locks_fops);
+
+ if (ns_is_client(ns)) {
+ LDLM_NS_ADD_VAR("lock_unused_count", &ns->ns_nr_unused,
+ &ldlm_uint_fops);
+ LDLM_NS_ADD_VAR("lru_size", ns, &lprocfs_lru_size_fops);
+ LDLM_NS_ADD_VAR("lru_max_age", &ns->ns_max_age,
+ &ldlm_rw_uint_fops);
+ LDLM_NS_ADD_VAR("early_lock_cancel", ns, &lprocfs_elc_fops);
+ } else {
+ LDLM_NS_ADD_VAR("ctime_age_limit", &ns->ns_ctime_age_limit,
+ &ldlm_rw_uint_fops);
+ LDLM_NS_ADD_VAR("lock_timeouts", &ns->ns_timeouts,
+ &ldlm_uint_fops);
+ LDLM_NS_ADD_VAR("max_nolock_bytes", &ns->ns_max_nolock_size,
+ &ldlm_rw_uint_fops);
+ LDLM_NS_ADD_VAR("contention_seconds", &ns->ns_contention_time,
+ &ldlm_rw_uint_fops);
+ LDLM_NS_ADD_VAR("contended_locks", &ns->ns_contended_locks,
+ &ldlm_rw_uint_fops);
+ LDLM_NS_ADD_VAR("max_parallel_ast", &ns->ns_max_parallel_ast,
+ &ldlm_rw_uint_fops);
+ }
+ return 0;
+}
+#undef MAX_STRING_SIZE
+#else /* CONFIG_PROC_FS */
+
+#define ldlm_namespace_proc_unregister(ns) ({; })
+#define ldlm_namespace_proc_register(ns) ({0; })
+
+#endif /* CONFIG_PROC_FS */
+
+static unsigned ldlm_res_hop_hash(struct cfs_hash *hs,
+ const void *key, unsigned mask)
+{
+ const struct ldlm_res_id *id = key;
+ unsigned val = 0;
+ unsigned i;
+
+ for (i = 0; i < RES_NAME_SIZE; i++)
+ val += id->name[i];
+ return val & mask;
+}
+
+static unsigned ldlm_res_hop_fid_hash(struct cfs_hash *hs,
+ const void *key, unsigned mask)
+{
+ const struct ldlm_res_id *id = key;
+ struct lu_fid fid;
+ __u32 hash;
+ __u32 val;
+
+ fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF];
+ fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF];
+ fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+ hash = fid_flatten32(&fid);
+ hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+ if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) {
+ val = id->name[LUSTRE_RES_ID_HSH_OFF];
+ hash += (val >> 5) + (val << 11);
+ } else {
+ val = fid_oid(&fid);
+ }
+ hash = hash_long(hash, hs->hs_bkt_bits);
+ /* give me another random factor */
+ hash -= hash_long((unsigned long)hs, val % 11 + 3);
+
+ hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+ hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1);
+
+ return hash & mask;
+}
+
+static void *ldlm_res_hop_key(struct hlist_node *hnode)
+{
+ struct ldlm_resource *res;
+
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ return &res->lr_name;
+}
+
+static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct ldlm_resource *res;
+
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ return ldlm_res_eq((const struct ldlm_res_id *)key,
+ (const struct ldlm_res_id *)&res->lr_name);
+}
+
+static void *ldlm_res_hop_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct ldlm_resource, lr_hash);
+}
+
+static void ldlm_res_hop_get_locked(struct cfs_hash *hs,
+ struct hlist_node *hnode)
+{
+ struct ldlm_resource *res;
+
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ ldlm_resource_getref(res);
+}
+
+static void ldlm_res_hop_put_locked(struct cfs_hash *hs,
+ struct hlist_node *hnode)
+{
+ struct ldlm_resource *res;
+
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ /* cfs_hash_for_each_nolock is the only place this is called from */
+ ldlm_resource_putref_locked(res);
+}
+
+static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ldlm_resource *res;
+
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ ldlm_resource_putref(res);
+}
+
+cfs_hash_ops_t ldlm_ns_hash_ops = {
+ .hs_hash = ldlm_res_hop_hash,
+ .hs_key = ldlm_res_hop_key,
+ .hs_keycmp = ldlm_res_hop_keycmp,
+ .hs_keycpy = NULL,
+ .hs_object = ldlm_res_hop_object,
+ .hs_get = ldlm_res_hop_get_locked,
+ .hs_put_locked = ldlm_res_hop_put_locked,
+ .hs_put = ldlm_res_hop_put
+};
+
+cfs_hash_ops_t ldlm_ns_fid_hash_ops = {
+ .hs_hash = ldlm_res_hop_fid_hash,
+ .hs_key = ldlm_res_hop_key,
+ .hs_keycmp = ldlm_res_hop_keycmp,
+ .hs_keycpy = NULL,
+ .hs_object = ldlm_res_hop_object,
+ .hs_get = ldlm_res_hop_get_locked,
+ .hs_put_locked = ldlm_res_hop_put_locked,
+ .hs_put = ldlm_res_hop_put
+};
+
+struct ldlm_ns_hash_def {
+ ldlm_ns_type_t nsd_type;
+ /** hash bucket bits */
+ unsigned nsd_bkt_bits;
+ /** hash bits */
+ unsigned nsd_all_bits;
+ /** hash operations */
+ cfs_hash_ops_t *nsd_hops;
+};
+
+struct ldlm_ns_hash_def ldlm_ns_hash_defs[] = {
+ {
+ .nsd_type = LDLM_NS_TYPE_MDC,
+ .nsd_bkt_bits = 11,
+ .nsd_all_bits = 16,
+ .nsd_hops = &ldlm_ns_fid_hash_ops,
+ },
+ {
+ .nsd_type = LDLM_NS_TYPE_MDT,
+ .nsd_bkt_bits = 14,
+ .nsd_all_bits = 21,
+ .nsd_hops = &ldlm_ns_fid_hash_ops,
+ },
+ {
+ .nsd_type = LDLM_NS_TYPE_OSC,
+ .nsd_bkt_bits = 8,
+ .nsd_all_bits = 12,
+ .nsd_hops = &ldlm_ns_hash_ops,
+ },
+ {
+ .nsd_type = LDLM_NS_TYPE_OST,
+ .nsd_bkt_bits = 11,
+ .nsd_all_bits = 17,
+ .nsd_hops = &ldlm_ns_hash_ops,
+ },
+ {
+ .nsd_type = LDLM_NS_TYPE_MGC,
+ .nsd_bkt_bits = 4,
+ .nsd_all_bits = 4,
+ .nsd_hops = &ldlm_ns_hash_ops,
+ },
+ {
+ .nsd_type = LDLM_NS_TYPE_MGT,
+ .nsd_bkt_bits = 4,
+ .nsd_all_bits = 4,
+ .nsd_hops = &ldlm_ns_hash_ops,
+ },
+ {
+ .nsd_type = LDLM_NS_TYPE_UNKNOWN,
+ },
+};
+
+/**
+ * Create and initialize new empty namespace.
+ */
+struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
+ ldlm_side_t client,
+ ldlm_appetite_t apt,
+ ldlm_ns_type_t ns_type)
+{
+ struct ldlm_namespace *ns = NULL;
+ struct ldlm_ns_bucket *nsb;
+ struct ldlm_ns_hash_def *nsd;
+ struct cfs_hash_bd bd;
+ int idx;
+ int rc;
+
+ LASSERT(obd != NULL);
+
+ rc = ldlm_get_ref();
+ if (rc) {
+ CERROR("ldlm_get_ref failed: %d\n", rc);
+ return NULL;
+ }
+
+ for (idx = 0;; idx++) {
+ nsd = &ldlm_ns_hash_defs[idx];
+ if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) {
+ CERROR("Unknown type %d for ns %s\n", ns_type, name);
+ goto out_ref;
+ }
+
+ if (nsd->nsd_type == ns_type)
+ break;
+ }
+
+ OBD_ALLOC_PTR(ns);
+ if (!ns)
+ goto out_ref;
+
+ ns->ns_rs_hash = cfs_hash_create(name,
+ nsd->nsd_all_bits, nsd->nsd_all_bits,
+ nsd->nsd_bkt_bits, sizeof(*nsb),
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ nsd->nsd_hops,
+ CFS_HASH_DEPTH |
+ CFS_HASH_BIGNAME |
+ CFS_HASH_SPIN_BKTLOCK |
+ CFS_HASH_NO_ITEMREF);
+ if (ns->ns_rs_hash == NULL)
+ goto out_ns;
+
+ cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) {
+ nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+ at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0);
+ nsb->nsb_namespace = ns;
+ }
+
+ ns->ns_obd = obd;
+ ns->ns_appetite = apt;
+ ns->ns_client = client;
+
+ INIT_LIST_HEAD(&ns->ns_list_chain);
+ INIT_LIST_HEAD(&ns->ns_unused_list);
+ spin_lock_init(&ns->ns_lock);
+ atomic_set(&ns->ns_bref, 0);
+ init_waitqueue_head(&ns->ns_waitq);
+
+ ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES;
+ ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS;
+ ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS;
+
+ ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
+ ns->ns_nr_unused = 0;
+ ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE;
+ ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE;
+ ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT;
+ ns->ns_timeouts = 0;
+ ns->ns_orig_connect_flags = 0;
+ ns->ns_connect_flags = 0;
+ ns->ns_stopping = 0;
+ rc = ldlm_namespace_proc_register(ns);
+ if (rc != 0) {
+ CERROR("Can't initialize ns proc, rc %d\n", rc);
+ goto out_hash;
+ }
+
+ idx = ldlm_namespace_nr_read(client);
+ rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client);
+ if (rc) {
+ CERROR("Can't initialize lock pool, rc %d\n", rc);
+ goto out_proc;
+ }
+
+ ldlm_namespace_register(ns, client);
+ return ns;
+out_proc:
+ ldlm_namespace_proc_unregister(ns);
+ ldlm_namespace_cleanup(ns, 0);
+out_hash:
+ cfs_hash_putref(ns->ns_rs_hash);
+out_ns:
+ OBD_FREE_PTR(ns);
+out_ref:
+ ldlm_put_ref();
+ return NULL;
+}
+EXPORT_SYMBOL(ldlm_namespace_new);
+
+extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+
+/**
+ * Cancel and destroy all locks on a resource.
+ *
+ * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just
+ * clean up. This is currently only used for recovery, and we make
+ * certain assumptions as a result--notably, that we shouldn't cancel
+ * locks with refs.
+ */
+static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
+ __u64 flags)
+{
+ struct list_head *tmp;
+ int rc = 0, client = ns_is_client(ldlm_res_to_ns(res));
+ bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY);
+
+ do {
+ struct ldlm_lock *lock = NULL;
+
+ /* First, look for a lock that has not been cleaned yet;
+ * all cleaned locks are marked by the CLEANED flag. */
+ lock_res(res);
+ list_for_each(tmp, q) {
+ lock = list_entry(tmp, struct ldlm_lock,
+ l_res_link);
+ if (lock->l_flags & LDLM_FL_CLEANED) {
+ lock = NULL;
+ continue;
+ }
+ LDLM_LOCK_GET(lock);
+ lock->l_flags |= LDLM_FL_CLEANED;
+ break;
+ }
+
+ if (lock == NULL) {
+ unlock_res(res);
+ break;
+ }
+
+ /* Set CBPENDING so nothing in the cancellation path
+ * can match this lock. */
+ lock->l_flags |= LDLM_FL_CBPENDING;
+ lock->l_flags |= LDLM_FL_FAILED;
+ lock->l_flags |= flags;
+
+ /* ... without sending a CANCEL message for local_only. */
+ if (local_only)
+ lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+
+ if (local_only && (lock->l_readers || lock->l_writers)) {
+ /* This is a little bit gross, but much better than the
+ * alternative: pretend that we got a blocking AST from
+ * the server, so that when the lock is decref'd, it
+ * will go away ... */
+ unlock_res(res);
+ LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY");
+ if (lock->l_completion_ast)
+ lock->l_completion_ast(lock, 0, NULL);
+ LDLM_LOCK_RELEASE(lock);
+ continue;
+ }
+
+ if (client) {
+ struct lustre_handle lockh;
+
+ unlock_res(res);
+ ldlm_lock2handle(lock, &lockh);
+ rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+ if (rc)
+ CERROR("ldlm_cli_cancel: %d\n", rc);
+ } else {
+ ldlm_resource_unlink_lock(lock);
+ unlock_res(res);
+ LDLM_DEBUG(lock, "Freeing a lock still held by a client node");
+ ldlm_lock_destroy(lock);
+ }
+ LDLM_LOCK_RELEASE(lock);
+ } while (1);
+}
+
+static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+ __u64 flags = *(__u64 *)arg;
+
+ cleanup_resource(res, &res->lr_granted, flags);
+ cleanup_resource(res, &res->lr_converting, flags);
+ cleanup_resource(res, &res->lr_waiting, flags);
+
+ return 0;
+}
+
+static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+ lock_res(res);
+ CERROR("%s: namespace resource "DLDLMRES
+ " (%p) refcount nonzero (%d) after lock cleanup; forcing cleanup.\n",
+ ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res,
+ atomic_read(&res->lr_refcount) - 1);
+
+ ldlm_resource_dump(D_ERROR, res);
+ unlock_res(res);
+ return 0;
+}
+
+/**
+ * Cancel and destroy all locks in the namespace.
+ *
+ * Typically used during evictions when server notified client that it was
+ * evicted and all of its state needs to be destroyed.
+ * Also used during shutdown.
+ */
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags)
+{
+ if (ns == NULL) {
+ CDEBUG(D_INFO, "NULL ns, skipping cleanup\n");
+ return ELDLM_OK;
+ }
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags);
+ cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL);
+ return ELDLM_OK;
+}
+EXPORT_SYMBOL(ldlm_namespace_cleanup);
+
+/**
+ * Attempts to free namespace.
+ *
+ * Only used when namespace goes away, like during an unmount.
+ */
+static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force)
+{
+ /* At shutdown time, don't call the cancellation callback */
+ ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0);
+
+ if (atomic_read(&ns->ns_bref) > 0) {
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ int rc;
+
+ CDEBUG(D_DLMTRACE,
+ "dlm namespace %s free waiting on refcount %d\n",
+ ldlm_ns_name(ns), atomic_read(&ns->ns_bref));
+force_wait:
+ if (force)
+ lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+
+ rc = l_wait_event(ns->ns_waitq,
+ atomic_read(&ns->ns_bref) == 0, &lwi);
+
+ /* Forced cleanups should be able to reclaim all references,
+ * so it's safe to wait forever... we can't leak locks... */
+ if (force && rc == -ETIMEDOUT) {
+ LCONSOLE_ERROR("Forced cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n",
+ ldlm_ns_name(ns),
+ atomic_read(&ns->ns_bref), rc);
+ goto force_wait;
+ }
+
+ if (atomic_read(&ns->ns_bref)) {
+ LCONSOLE_ERROR("Cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n",
+ ldlm_ns_name(ns),
+ atomic_read(&ns->ns_bref), rc);
+ return ELDLM_NAMESPACE_EXISTS;
+ }
+ CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n",
+ ldlm_ns_name(ns));
+ }
+
+ return ELDLM_OK;
+}
+
+/**
+ * Performs various cleanups for passed \a ns to make it drop refc and be
+ * ready for freeing. Waits for refc == 0.
+ *
+ * The following is done:
+ * (0) Unregister \a ns from its list to make inaccessible for potential
+ * users like pools thread and others;
+ * (1) Clear all locks in \a ns.
+ */
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+ struct obd_import *imp,
+ int force)
+{
+ int rc;
+
+ if (!ns)
+ return;
+
+ spin_lock(&ns->ns_lock);
+ ns->ns_stopping = 1;
+ spin_unlock(&ns->ns_lock);
+
+ /*
+ * Can fail with -EINTR when force == 0 in which case try harder.
+ */
+ rc = __ldlm_namespace_free(ns, force);
+ if (rc != ELDLM_OK) {
+ if (imp) {
+ ptlrpc_disconnect_import(imp, 0);
+ ptlrpc_invalidate_import(imp);
+ }
+
+ /*
+ * With all requests dropped and the import inactive
+ * we are guaranteed all reference will be dropped.
+ */
+ rc = __ldlm_namespace_free(ns, 1);
+ LASSERT(rc == 0);
+ }
+}
+
+/**
+ * Frees the memory structures related to \a ns. This is only done
+ * when ldlm_namespace_free_prior() has successfully removed all resources
+ * referencing \a ns and its refc == 0.
+ */
+void ldlm_namespace_free_post(struct ldlm_namespace *ns)
+{
+ if (!ns)
+ return;
+
+ /* Make sure that nobody can find this ns in its list. */
+ ldlm_namespace_unregister(ns, ns->ns_client);
+ /* Fini pool _before_ parent proc dir is removed. This is important as
+ * ldlm_pool_fini() removes own proc dir which is child to @dir.
+ * Removing it after @dir may cause oops. */
+ ldlm_pool_fini(&ns->ns_pool);
+
+ ldlm_namespace_proc_unregister(ns);
+ cfs_hash_putref(ns->ns_rs_hash);
+ /* Namespace \a ns should not be on the list at this time; otherwise
+ * this will cause issues related to using a freed \a ns in the poold
+ * thread. */
+ LASSERT(list_empty(&ns->ns_list_chain));
+ OBD_FREE_PTR(ns);
+ ldlm_put_ref();
+}
+
+/**
+ * Cleanup the resource, and free namespace.
+ * bug 12864:
+ * Deadlock issue:
+ * proc1: destroy import
+ * class_disconnect_export(grab cl_sem) ->
+ * -> ldlm_namespace_free ->
+ * -> lprocfs_remove(grab _lprocfs_lock).
+ * proc2: read proc info
+ * lprocfs_fops_read(grab _lprocfs_lock) ->
+ * -> osc_rd_active, etc(grab cl_sem).
+ *
+ * So ldlm_namespace_free has to be split into two parts - the first
+ * part, ldlm_namespace_free_prior, is used to clean up the resources which
+ * are being used; the second part, ldlm_namespace_free_post, is used to
+ * unregister the lprocfs entries and then free memory. It will be called
+ * without cli->cl_sem held.
+ */
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+ struct obd_import *imp,
+ int force)
+{
+ ldlm_namespace_free_prior(ns, imp, force);
+ ldlm_namespace_free_post(ns);
+}
+EXPORT_SYMBOL(ldlm_namespace_free);
+
+void ldlm_namespace_get(struct ldlm_namespace *ns)
+{
+ atomic_inc(&ns->ns_bref);
+}
+EXPORT_SYMBOL(ldlm_namespace_get);
+
+/* This is only for callers that care about refcount */
+int ldlm_namespace_get_return(struct ldlm_namespace *ns)
+{
+ return atomic_inc_return(&ns->ns_bref);
+}
+
+void ldlm_namespace_put(struct ldlm_namespace *ns)
+{
+ if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) {
+ wake_up(&ns->ns_waitq);
+ spin_unlock(&ns->ns_lock);
+ }
+}
+EXPORT_SYMBOL(ldlm_namespace_put);
+
+/** Register \a ns in the list of namespaces */
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+ mutex_lock(ldlm_namespace_lock(client));
+ LASSERT(list_empty(&ns->ns_list_chain));
+ list_add(&ns->ns_list_chain, ldlm_namespace_inactive_list(client));
+ ldlm_namespace_nr_inc(client);
+ mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Unregister \a ns from the list of namespaces. */
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+ mutex_lock(ldlm_namespace_lock(client));
+ LASSERT(!list_empty(&ns->ns_list_chain));
+ /* Some asserts and possibly other parts of the code are still
+ * using list_empty(&ns->ns_list_chain). This is why it is
+ * important to use list_del_init() here. */
+ list_del_init(&ns->ns_list_chain);
+ ldlm_namespace_nr_dec(client);
+ mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns,
+ ldlm_side_t client)
+{
+ LASSERT(!list_empty(&ns->ns_list_chain));
+ LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+ list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns,
+ ldlm_side_t client)
+{
+ LASSERT(!list_empty(&ns->ns_list_chain));
+ LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+ list_move_tail(&ns->ns_list_chain,
+ ldlm_namespace_inactive_list(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client)
+{
+ LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+ LASSERT(!list_empty(ldlm_namespace_list(client)));
+ return container_of(ldlm_namespace_list(client)->next,
+ struct ldlm_namespace, ns_list_chain);
+}
+
+/** Create and initialize new resource. */
+static struct ldlm_resource *ldlm_resource_new(void)
+{
+ struct ldlm_resource *res;
+ int idx;
+
+ OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, GFP_NOFS);
+ if (res == NULL)
+ return NULL;
+
+ INIT_LIST_HEAD(&res->lr_granted);
+ INIT_LIST_HEAD(&res->lr_converting);
+ INIT_LIST_HEAD(&res->lr_waiting);
+
+ /* Initialize interval trees for each lock mode. */
+ for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+ res->lr_itree[idx].lit_size = 0;
+ res->lr_itree[idx].lit_mode = 1 << idx;
+ res->lr_itree[idx].lit_root = NULL;
+ }
+
+ atomic_set(&res->lr_refcount, 1);
+ spin_lock_init(&res->lr_lock);
+ lu_ref_init(&res->lr_reference);
+
+ /* The creator of the resource must unlock the mutex after LVB
+ * initialization. */
+ mutex_init(&res->lr_lvb_mutex);
+ mutex_lock(&res->lr_lvb_mutex);
+
+ return res;
+}
+
+/**
+ * Return a reference to resource with given name, creating it if necessary.
+ * Args: namespace with ns_lock unlocked
+ * Locks: takes and releases NS hash-lock and res->lr_lock
+ * Returns: referenced, unlocked ldlm_resource or NULL
+ */
+struct ldlm_resource *
+ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent,
+ const struct ldlm_res_id *name, ldlm_type_t type, int create)
+{
+ struct hlist_node *hnode;
+ struct ldlm_resource *res;
+ struct cfs_hash_bd bd;
+ __u64 version;
+ int ns_refcount = 0;
+
+ LASSERT(ns != NULL);
+ LASSERT(parent == NULL);
+ LASSERT(ns->ns_rs_hash != NULL);
+ LASSERT(name->name[0] != 0);
+
+ cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0);
+ hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+ if (hnode != NULL) {
+ cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ /* Synchronize with regard to resource creation. */
+ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+ mutex_lock(&res->lr_lvb_mutex);
+ mutex_unlock(&res->lr_lvb_mutex);
+ }
+
+ if (unlikely(res->lr_lvb_len < 0)) {
+ ldlm_resource_putref(res);
+ res = NULL;
+ }
+ return res;
+ }
+
+ version = cfs_hash_bd_version_get(&bd);
+ cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+
+ if (create == 0)
+ return NULL;
+
+ LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE,
+ "type: %d\n", type);
+ res = ldlm_resource_new();
+ if (!res)
+ return NULL;
+
+ res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+ res->lr_name = *name;
+ res->lr_type = type;
+ res->lr_most_restr = LCK_NL;
+
+ cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+ hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL :
+ cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+
+ if (hnode != NULL) {
+ /* Someone won the race and already added the resource. */
+ cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+ /* Clean lu_ref for failed resource. */
+ lu_ref_fini(&res->lr_reference);
+ /* We have taken lr_lvb_mutex. Drop it. */
+ mutex_unlock(&res->lr_lvb_mutex);
+ OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res));
+
+ res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+ /* Synchronize with regard to resource creation. */
+ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+ mutex_lock(&res->lr_lvb_mutex);
+ mutex_unlock(&res->lr_lvb_mutex);
+ }
+
+ if (unlikely(res->lr_lvb_len < 0)) {
+ ldlm_resource_putref(res);
+ res = NULL;
+ }
+ return res;
+ }
+ /* We won! Let's add the resource. */
+ cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash);
+ if (cfs_hash_bd_count_get(&bd) == 1)
+ ns_refcount = ldlm_namespace_get_return(ns);
+
+ cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+ int rc;
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2);
+ rc = ns->ns_lvbo->lvbo_init(res);
+ if (rc < 0) {
+ CERROR("%s: lvbo_init failed for resource %#llx:%#llx: rc = %d\n",
+ ns->ns_obd->obd_name, name->name[0],
+ name->name[1], rc);
+ if (res->lr_lvb_data) {
+ OBD_FREE(res->lr_lvb_data, res->lr_lvb_len);
+ res->lr_lvb_data = NULL;
+ }
+ res->lr_lvb_len = rc;
+ mutex_unlock(&res->lr_lvb_mutex);
+ ldlm_resource_putref(res);
+ return NULL;
+ }
+ }
+
+ /* The resource is created with lr_lvb_mutex held; drop it now. */
+ mutex_unlock(&res->lr_lvb_mutex);
+
+ /* Let's see if we happened to be the very first resource in this
+ * namespace. If so, and this is a client namespace, we need to move
+ * the namespace into the active namespaces list to be patrolled by
+ * the ldlm_poold. */
+ if (ns_is_client(ns) && ns_refcount == 1) {
+ mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+ ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT);
+ mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+ }
+
+ return res;
+}
+EXPORT_SYMBOL(ldlm_resource_get);
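+
+/*
+ * Usage sketch (illustrative; "my_ns" and "res_id" are placeholder
+ * names): the returned resource is referenced and unlocked, so a
+ * typical caller pairs ldlm_resource_get() with ldlm_resource_putref()
+ * and takes lr_lock itself while inspecting the lock lists:
+ *
+ *	struct ldlm_resource *res;
+ *
+ *	res = ldlm_resource_get(my_ns, NULL, &res_id, LDLM_PLAIN, 1);
+ *	if (res == NULL)
+ *		return -ENOMEM;
+ *	lock_res(res);
+ *	... walk res->lr_granted / res->lr_waiting ...
+ *	unlock_res(res);
+ *	ldlm_resource_putref(res);
+ */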
+
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res)
+{
+ LASSERT(res != NULL);
+ LASSERT(res != LP_POISON);
+ atomic_inc(&res->lr_refcount);
+ CDEBUG(D_INFO, "getref res: %p count: %d\n", res,
+ atomic_read(&res->lr_refcount));
+ return res;
+}
+
+static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd,
+ struct ldlm_resource *res)
+{
+ struct ldlm_ns_bucket *nsb = res->lr_ns_bucket;
+
+ if (!list_empty(&res->lr_granted)) {
+ ldlm_resource_dump(D_ERROR, res);
+ LBUG();
+ }
+
+ if (!list_empty(&res->lr_converting)) {
+ ldlm_resource_dump(D_ERROR, res);
+ LBUG();
+ }
+
+ if (!list_empty(&res->lr_waiting)) {
+ ldlm_resource_dump(D_ERROR, res);
+ LBUG();
+ }
+
+ cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash,
+ bd, &res->lr_hash);
+ lu_ref_fini(&res->lr_reference);
+ if (cfs_hash_bd_count_get(bd) == 0)
+ ldlm_namespace_put(nsb->nsb_namespace);
+}
+
+/* Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref(struct ldlm_resource *res)
+{
+ struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+ struct cfs_hash_bd bd;
+
+ LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+ CDEBUG(D_INFO, "putref res: %p count: %d\n",
+ res, atomic_read(&res->lr_refcount) - 1);
+
+ cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd);
+ if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) {
+ __ldlm_resource_putref_final(&bd, res);
+ cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+ ns->ns_lvbo->lvbo_free(res);
+ OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res));
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ldlm_resource_putref);
+
+/* Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref_locked(struct ldlm_resource *res)
+{
+ struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+ LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+ CDEBUG(D_INFO, "putref res: %p count: %d\n",
+ res, atomic_read(&res->lr_refcount) - 1);
+
+ if (atomic_dec_and_test(&res->lr_refcount)) {
+ struct cfs_hash_bd bd;
+
+ cfs_hash_bd_get(ldlm_res_to_ns(res)->ns_rs_hash,
+ &res->lr_name, &bd);
+ __ldlm_resource_putref_final(&bd, res);
+ cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+ /* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF,
+ * so we should never get here via cfs_hash_del();
+ * cfs_hash_for_each_nolock() is the only path that can reach
+ * this point, and there it is safe to release cfs_hash_bd_lock.
+ */
+ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+ ns->ns_lvbo->lvbo_free(res);
+ OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof(*res));
+
+ cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * Add a lock to the specified lock list of a given resource.
+ */
+void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
+ struct ldlm_lock *lock)
+{
+ check_res_locked(res);
+
+ LDLM_DEBUG(lock, "About to add this lock:\n");
+
+ if (lock->l_flags & LDLM_FL_DESTROYED) {
+ CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+ return;
+ }
+
+ LASSERT(list_empty(&lock->l_res_link));
+
+ list_add_tail(&lock->l_res_link, head);
+}
+
+/**
+ * Insert a lock into a resource's lock list after the specified lock.
+ *
+ * The resource is obtained from the lock we are inserting after.
+ */
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+ struct ldlm_lock *new)
+{
+ struct ldlm_resource *res = original->l_resource;
+
+ check_res_locked(res);
+
+ ldlm_resource_dump(D_INFO, res);
+ LDLM_DEBUG(new, "About to insert this lock after %p:\n", original);
+
+ if (new->l_flags & LDLM_FL_DESTROYED) {
+ CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+ goto out;
+ }
+
+ LASSERT(list_empty(&new->l_res_link));
+
+ list_add(&new->l_res_link, &original->l_res_link);
+ out:;
+}
+
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock)
+{
+ int type = lock->l_resource->lr_type;
+
+ check_res_locked(lock->l_resource);
+ if (type == LDLM_IBITS || type == LDLM_PLAIN)
+ ldlm_unlink_lock_skiplist(lock);
+ else if (type == LDLM_EXTENT)
+ ldlm_extent_unlink_lock(lock);
+ list_del_init(&lock->l_res_link);
+}
+EXPORT_SYMBOL(ldlm_resource_unlink_lock);
+
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc)
+{
+ desc->lr_type = res->lr_type;
+ desc->lr_name = res->lr_name;
+}
+
+/**
+ * Print information about all locks in all namespaces on this node to debug
+ * log.
+ */
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level)
+{
+ struct list_head *tmp;
+
+ if (!((libcfs_debug | D_ERROR) & level))
+ return;
+
+ mutex_lock(ldlm_namespace_lock(client));
+
+ list_for_each(tmp, ldlm_namespace_list(client)) {
+ struct ldlm_namespace *ns;
+
+ ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain);
+ ldlm_namespace_dump(level, ns);
+ }
+
+ mutex_unlock(ldlm_namespace_lock(client));
+}
+EXPORT_SYMBOL(ldlm_dump_all_namespaces);
+
+static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
+{
+ struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+ int level = (int)(unsigned long)arg;
+
+ lock_res(res);
+ ldlm_resource_dump(level, res);
+ unlock_res(res);
+
+ return 0;
+}
+
+/**
+ * Print information about all locks in this namespace on this node to debug
+ * log.
+ */
+void ldlm_namespace_dump(int level, struct ldlm_namespace *ns)
+{
+ if (!((libcfs_debug | D_ERROR) & level))
+ return;
+
+ CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n",
+ ldlm_ns_name(ns), atomic_read(&ns->ns_bref),
+ ns_is_client(ns) ? "client" : "server");
+
+ if (time_before(cfs_time_current(), ns->ns_next_dump))
+ return;
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ ldlm_res_hash_dump,
+ (void *)(unsigned long)level);
+ spin_lock(&ns->ns_lock);
+ ns->ns_next_dump = cfs_time_shift(10);
+ spin_unlock(&ns->ns_lock);
+}
+EXPORT_SYMBOL(ldlm_namespace_dump);
+
+/**
+ * Print information about all locks in this resource to debug log.
+ */
+void ldlm_resource_dump(int level, struct ldlm_resource *res)
+{
+ struct ldlm_lock *lock;
+ unsigned int granted = 0;
+
+ CLASSERT(RES_NAME_SIZE == 4);
+
+ if (!((libcfs_debug | D_ERROR) & level))
+ return;
+
+ CDEBUG(level, "--- Resource: "DLDLMRES" (%p) refcount = %d\n",
+ PLDLMRES(res), res, atomic_read(&res->lr_refcount));
+
+ if (!list_empty(&res->lr_granted)) {
+ CDEBUG(level, "Granted locks (in reverse order):\n");
+ list_for_each_entry_reverse(lock, &res->lr_granted,
+ l_res_link) {
+ LDLM_DEBUG_LIMIT(level, lock, "###");
+ if (!(level & D_CANTMASK) &&
+ ++granted > ldlm_dump_granted_max) {
+ CDEBUG(level, "only dump %d granted locks to avoid DDOS.\n",
+ granted);
+ break;
+ }
+ }
+ }
+ if (!list_empty(&res->lr_converting)) {
+ CDEBUG(level, "Converting locks:\n");
+ list_for_each_entry(lock, &res->lr_converting, l_res_link)
+ LDLM_DEBUG_LIMIT(level, lock, "###");
+ }
+ if (!list_empty(&res->lr_waiting)) {
+ CDEBUG(level, "Waiting locks:\n");
+ list_for_each_entry(lock, &res->lr_waiting, l_res_link)
+ LDLM_DEBUG_LIMIT(level, lock, "###");
+ }
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/Makefile b/drivers/staging/lustre/lustre/libcfs/Makefile
new file mode 100644
index 000000000..2996a48a3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/Makefile
@@ -0,0 +1,18 @@
+obj-$(CONFIG_LUSTRE_FS) += libcfs.o
+
+libcfs-linux-objs := linux-tracefile.o linux-debug.o
+libcfs-linux-objs += linux-prim.o linux-cpu.o
+libcfs-linux-objs += linux-tcpip.o
+libcfs-linux-objs += linux-curproc.o
+libcfs-linux-objs += linux-module.o
+libcfs-linux-objs += linux-crypto.o
+libcfs-linux-objs += linux-crypto-adler.o
+
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+
+libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
+ libcfs_string.o hash.o kernel_user_comm.o \
+ prng.o workitem.o libcfs_cpu.o \
+ libcfs_mem.o libcfs_lock.o
+
+libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
diff --git a/drivers/staging/lustre/lustre/libcfs/debug.c b/drivers/staging/lustre/lustre/libcfs/debug.c
new file mode 100644
index 000000000..021c92fa0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/debug.c
@@ -0,0 +1,460 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "tracefile.h"
+
+static char debug_file_name[1024];
+
+unsigned int libcfs_subsystem_debug = ~0;
+module_param(libcfs_subsystem_debug, int, 0644);
+MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask");
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+
+unsigned int libcfs_debug = (D_CANTMASK |
+ D_NETERROR | D_HA | D_CONFIG | D_IOCTL);
+module_param(libcfs_debug, int, 0644);
+MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask");
+EXPORT_SYMBOL(libcfs_debug);
+
+static unsigned int libcfs_debug_mb;
+module_param(libcfs_debug_mb, uint, 0644);
+MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size.");
+EXPORT_SYMBOL(libcfs_debug_mb);
+
+unsigned int libcfs_printk = D_CANTMASK;
+module_param(libcfs_printk, uint, 0644);
+MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask");
+EXPORT_SYMBOL(libcfs_printk);
+
+unsigned int libcfs_console_ratelimit = 1;
+module_param(libcfs_console_ratelimit, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)");
+EXPORT_SYMBOL(libcfs_console_ratelimit);
+
+unsigned int libcfs_console_max_delay;
+module_param(libcfs_console_max_delay, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_max_delay);
+
+unsigned int libcfs_console_min_delay;
+module_param(libcfs_console_min_delay, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_min_delay);
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+module_param(libcfs_console_backoff, uint, 0644);
+MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor");
+EXPORT_SYMBOL(libcfs_console_backoff);
+
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+static unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+module_param(libcfs_panic_on_lbug, uint, 0644);
+MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG");
+EXPORT_SYMBOL(libcfs_panic_on_lbug);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
+
+static wait_queue_head_t debug_ctlwq;
+
+char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT;
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+static char *libcfs_debug_file_path;
+module_param(libcfs_debug_file_path, charp, 0644);
+MODULE_PARM_DESC(libcfs_debug_file_path,
+ "Path for dumping debug logs, set 'NONE' to prevent log dumping");
+
+int libcfs_panic_in_progress;
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+static const char *
+libcfs_debug_subsys2str(int subsys)
+{
+ switch (1 << subsys) {
+ default:
+ return NULL;
+ case S_UNDEFINED:
+ return "undefined";
+ case S_MDC:
+ return "mdc";
+ case S_MDS:
+ return "mds";
+ case S_OSC:
+ return "osc";
+ case S_OST:
+ return "ost";
+ case S_CLASS:
+ return "class";
+ case S_LOG:
+ return "log";
+ case S_LLITE:
+ return "llite";
+ case S_RPC:
+ return "rpc";
+ case S_LNET:
+ return "lnet";
+ case S_LND:
+ return "lnd";
+ case S_PINGER:
+ return "pinger";
+ case S_FILTER:
+ return "filter";
+ case S_ECHO:
+ return "echo";
+ case S_LDLM:
+ return "ldlm";
+ case S_LOV:
+ return "lov";
+ case S_LQUOTA:
+ return "lquota";
+ case S_OSD:
+ return "osd";
+ case S_LMV:
+ return "lmv";
+ case S_SEC:
+ return "sec";
+ case S_GSS:
+ return "gss";
+ case S_MGC:
+ return "mgc";
+ case S_MGS:
+ return "mgs";
+ case S_FID:
+ return "fid";
+ case S_FLD:
+ return "fld";
+ }
+}
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+static const char *
+libcfs_debug_dbg2str(int debug)
+{
+ switch (1 << debug) {
+ default:
+ return NULL;
+ case D_TRACE:
+ return "trace";
+ case D_INODE:
+ return "inode";
+ case D_SUPER:
+ return "super";
+ case D_EXT2:
+ return "ext2";
+ case D_MALLOC:
+ return "malloc";
+ case D_CACHE:
+ return "cache";
+ case D_INFO:
+ return "info";
+ case D_IOCTL:
+ return "ioctl";
+ case D_NETERROR:
+ return "neterror";
+ case D_NET:
+ return "net";
+ case D_WARNING:
+ return "warning";
+ case D_BUFFS:
+ return "buffs";
+ case D_OTHER:
+ return "other";
+ case D_DENTRY:
+ return "dentry";
+ case D_NETTRACE:
+ return "nettrace";
+ case D_PAGE:
+ return "page";
+ case D_DLMTRACE:
+ return "dlmtrace";
+ case D_ERROR:
+ return "error";
+ case D_EMERG:
+ return "emerg";
+ case D_HA:
+ return "ha";
+ case D_RPCTRACE:
+ return "rpctrace";
+ case D_VFSTRACE:
+ return "vfstrace";
+ case D_READA:
+ return "reada";
+ case D_MMAP:
+ return "mmap";
+ case D_CONFIG:
+ return "config";
+ case D_CONSOLE:
+ return "console";
+ case D_QUOTA:
+ return "quota";
+ case D_SEC:
+ return "sec";
+ case D_LFSCK:
+ return "lfsck";
+ }
+}
+
+int
+libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+ const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+ libcfs_debug_dbg2str;
+ int len = 0;
+ const char *token;
+ int i;
+
+ if (mask == 0) { /* "0" */
+ if (size > 0)
+ str[0] = '0';
+ len = 1;
+ } else { /* space-separated tokens */
+ for (i = 0; i < 32; i++) {
+ if ((mask & (1 << i)) == 0)
+ continue;
+
+ token = fn(i);
+ if (token == NULL) /* unused bit */
+ continue;
+
+ if (len > 0) { /* separator? */
+ if (len < size)
+ str[len] = ' ';
+ len++;
+ }
+
+ while (*token != 0) {
+ if (len < size)
+ str[len] = *token;
+ token++;
+ len++;
+ }
+ }
+ }
+
+ /* terminate 'str' */
+ if (len < size)
+ str[len] = 0;
+ else
+ str[size - 1] = 0;
+
+ return len;
+}
+
+int
+libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+ const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+ libcfs_debug_dbg2str;
+ int m = 0;
+ int matched;
+ int n;
+ int t;
+
+ /* Allow a number for backwards compatibility */
+
+ for (n = strlen(str); n > 0; n--)
+ if (!isspace(str[n-1]))
+ break;
+ matched = n;
+ t = sscanf(str, "%i%n", &m, &matched);
+ if (t >= 1 && matched == n) {
+ /* don't print warning for lctl set_param debug=0 or -1 */
+ if (m != 0 && m != -1)
+ CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n");
+ *mask = m;
+ return 0;
+ }
+
+ return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK,
+ 0xffffffff);
+}
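+
+/*
+ * Example (illustrative): the two helpers above round-trip between a
+ * debug mask and its space-separated token string.  For a single bit
+ * such as D_HA:
+ *
+ *	char buf[64];
+ *	int mask = D_HA;
+ *
+ *	libcfs_debug_mask2str(buf, sizeof(buf), mask, 0);
+ *	// buf now contains "ha"
+ *	libcfs_debug_str2mask(&mask, buf, 0);
+ *	// mask has the D_HA bit set again; numeric strings also parse
+ */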
+
+/**
+ * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages()
+ */
+void libcfs_debug_dumplog_internal(void *arg)
+{
+ void *journal_info;
+
+ journal_info = current->journal_info;
+ current->journal_info = NULL;
+
+ if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) {
+ snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+ "%s.%ld.%ld", libcfs_debug_file_path_arr,
+ get_seconds(), (long_ptr_t)arg);
+ pr_alert("LustreError: dumping log to %s\n",
+ debug_file_name);
+ cfs_tracefile_dump_all_pages(debug_file_name);
+ libcfs_run_debug_log_upcall(debug_file_name);
+ }
+
+ current->journal_info = journal_info;
+}
+
+static int libcfs_debug_dumplog_thread(void *arg)
+{
+ libcfs_debug_dumplog_internal(arg);
+ wake_up(&debug_ctlwq);
+ return 0;
+}
+
+void libcfs_debug_dumplog(void)
+{
+ wait_queue_t wait;
+ struct task_struct *dumper;
+
+ /* we're being careful to ensure that the kernel thread is
+ * able to set our state to running as it exits before we
+ * get to schedule() */
+ init_waitqueue_entry(&wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&debug_ctlwq, &wait);
+
+ dumper = kthread_run(libcfs_debug_dumplog_thread,
+ (void *)(long)current_pid(),
+ "libcfs_debug_dumper");
+ if (IS_ERR(dumper))
+ pr_err("LustreError: cannot start log dump thread: %ld\n",
+ PTR_ERR(dumper));
+ else
+ schedule();
+
+ /* be sure to tear down if kthread_run() failed */
+ remove_wait_queue(&debug_ctlwq, &wait);
+ set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+
+int libcfs_debug_init(unsigned long bufsize)
+{
+ int rc = 0;
+ unsigned int max = libcfs_debug_mb;
+
+ init_waitqueue_head(&debug_ctlwq);
+
+ if (libcfs_console_max_delay <= 0 || /* not set by user or */
+ libcfs_console_min_delay <= 0 || /* set to invalid values */
+ libcfs_console_min_delay >= libcfs_console_max_delay) {
+ libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY;
+ libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY;
+ }
+
+ if (libcfs_debug_file_path != NULL) {
+ strncpy(libcfs_debug_file_path_arr,
+ libcfs_debug_file_path, PATH_MAX-1);
+ libcfs_debug_file_path_arr[PATH_MAX - 1] = '\0';
+ }
+
+ /* If libcfs_debug_mb is uninitialized or set to an invalid value,
+ * fall back to a total of num_possible_cpus() * TCD_MAX_PAGES pages. */
+ if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) {
+ max = TCD_MAX_PAGES;
+ } else {
+ max = max / num_possible_cpus();
+ max <<= (20 - PAGE_CACHE_SHIFT);
+ }
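+ /*
+ * Worked example (illustrative): with libcfs_debug_mb = 256 on an
+ * 8-CPU machine with 4 KiB pages, the branch above computes
+ * (256 / 8) << (20 - PAGE_CACHE_SHIFT) = 32 << 8 = 8192 pages,
+ * i.e. 32 MiB as each CPU's share of the requested 256 MiB.
+ */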
+ rc = cfs_tracefile_init(max);
+
+ if (rc == 0)
+ libcfs_register_panic_notifier();
+
+ return rc;
+}
+
+int libcfs_debug_cleanup(void)
+{
+ libcfs_unregister_panic_notifier();
+ cfs_tracefile_exit();
+ return 0;
+}
+
+int libcfs_debug_clear_buffer(void)
+{
+ cfs_trace_flush_pages();
+ return 0;
+}
+
+/* Debug markers, although printed by S_LNET,
+ * should not be marked as such. */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int libcfs_debug_mark_buffer(const char *text)
+{
+ CDEBUG(D_TRACE,
+ "***************************************************\n");
+ LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text);
+ CDEBUG(D_TRACE,
+ "***************************************************\n");
+
+ return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_LNET
+
+void libcfs_debug_set_level(unsigned int debug_level)
+{
+ pr_warn("Lustre: Setting portals debug level to %08x\n",
+ debug_level);
+ libcfs_debug = debug_level;
+}
+
+EXPORT_SYMBOL(libcfs_debug_set_level);
diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c
new file mode 100644
index 000000000..92444b0fe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/fail.c
@@ -0,0 +1,138 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+unsigned long cfs_fail_loc = 0;
+EXPORT_SYMBOL(cfs_fail_loc);
+
+unsigned int cfs_fail_val = 0;
+EXPORT_SYMBOL(cfs_fail_val);
+
+wait_queue_head_t cfs_race_waitq;
+EXPORT_SYMBOL(cfs_race_waitq);
+
+int cfs_race_state;
+EXPORT_SYMBOL(cfs_race_state);
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set)
+{
+ static atomic_t cfs_fail_count = ATOMIC_INIT(0);
+
+ LASSERT(!(id & CFS_FAIL_ONCE));
+
+ if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) ==
+ (CFS_FAILED | CFS_FAIL_ONCE)) {
+ atomic_set(&cfs_fail_count, 0); /* paranoia */
+ return 0;
+ }
+
+ /* Fail 1/cfs_fail_val times */
+ if (cfs_fail_loc & CFS_FAIL_RAND) {
+ if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0)
+ return 0;
+ }
+
+ /* Skip the first cfs_fail_val, then fail */
+ if (cfs_fail_loc & CFS_FAIL_SKIP) {
+ if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val)
+ return 0;
+ }
+
+ /* check cfs_fail_val... */
+ if (set == CFS_FAIL_LOC_VALUE) {
+ if (cfs_fail_val != -1 && cfs_fail_val != value)
+ return 0;
+ }
+
+ /* Fail cfs_fail_val times, overridden by FAIL_ONCE */
+ if (cfs_fail_loc & CFS_FAIL_SOME &&
+ (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) {
+ int count = atomic_inc_return(&cfs_fail_count);
+
+ if (count >= cfs_fail_val) {
+ set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+ atomic_set(&cfs_fail_count, 0);
+ /* we lost the race to increment */
+ if (count > cfs_fail_val)
+ return 0;
+ }
+ }
+
+ if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) &&
+ (value & CFS_FAIL_ONCE))
+ set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+ /* Lost race to set CFS_FAILED_BIT. */
+ if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) {
+ /* If CFS_FAIL_ONCE is set, only one process can fail;
+ * otherwise multiple processes can fail at the same time. */
+ if (cfs_fail_loc & CFS_FAIL_ONCE)
+ return 0;
+ }
+
+ switch (set) {
+ case CFS_FAIL_LOC_NOSET:
+ case CFS_FAIL_LOC_VALUE:
+ break;
+ case CFS_FAIL_LOC_ORSET:
+ cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE);
+ break;
+ case CFS_FAIL_LOC_RESET:
+ cfs_fail_loc = value;
+ break;
+ default:
+ LASSERTF(0, "called with bad set %u\n", set);
+ break;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(__cfs_fail_check_set);
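+
+/*
+ * Sketch of the intended use (illustrative): tests set cfs_fail_loc to
+ * a fault id, optionally OR-ed with modifiers such as CFS_FAIL_ONCE,
+ * CFS_FAIL_SKIP or CFS_FAIL_RAND (with cfs_fail_val as the count or
+ * 1-in-N rate), and kernel code probes the fault point through the
+ * usual wrappers around __cfs_fail_check_set(), e.g.:
+ *
+ *	if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CREATE_RESOURCE))
+ *		return -EIO;
+ *
+ * or a timed variant such as the OBD_FAIL_TIMEOUT() call in
+ * ldlm_resource_get().
+ */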
+
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+ int ret = 0;
+
+ ret = __cfs_fail_check_set(id, value, set);
+ if (ret) {
+ CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
+ id, ms);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(ms) / 1000);
+ CERROR("cfs_fail_timeout id %x awake\n", id);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cfs_fail_timeout_set);
diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c
new file mode 100644
index 000000000..a55567e0d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/hash.c
@@ -0,0 +1,2098 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/hash.c
+ *
+ * Implement a hash class for general use throughout the Lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ * * CFS_HASH_DEBUG additional validation
+ * * CFS_HASH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen <zhen.liang@sun.com>
+ * - move all stuff to libcfs
+ * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH
+ * - ignore hs_rwlock if without CFS_HASH_REHASH setting
+ * - buckets are allocated one by one (instead of as contiguous memory),
+ * to avoid unnecessary cacheline conflict
+ *
+ * 2010-03-01: Liang Zhen <zhen.liang@sun.com>
+ * - "bucket" is a group of hlist_head now, user can specify bucket size
+ * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ * one lock for reducing memory overhead.
+ *
+ * - support lockless hash, caller will take care of locks:
+ * avoid lock overhead for hash tables that are already protected
+ * by locking in the caller for another reason
+ *
+ * - support both spin_lock/rwlock for bucket:
+ * overhead of spinlock contention is lower than read/write
+ * contention of rwlock, so using spinlock to serialize operations on
+ * bucket is more reasonable for those frequently changed hash tables
+ *
+ * - support single-lock mode:
+ * one lock protects all hash operations, avoiding the overhead of
+ * multiple locks when the hash table is always small
+ *
+ * - removed a lot of unnecessary addref & decref on hash elements:
+ * addref & decref are atomic operations and therefore expensive
+ * in many use-cases.
+ *
+ * - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ * some Lustre use-cases require these functions to be strictly
+ * non-blocking; in those cases the required rehash is scheduled
+ * on a different thread.
+ *
+ * - safer rehash on large hash table
+ * In the old implementation, the rehash function exclusively locked
+ * the hash table and finished the rehash in one batch; this is
+ * dangerous on SMP systems because rehashing millions of elements
+ * can take a long time. The new rehash can release the lock and
+ * relax the CPU in the middle of a rehash, so it is safe for another
+ * thread to search/change the hash table even while it is rehashing.
+ *
+ * - support two different refcount modes
+ * . hash table has refcount on element
+ * . hash table doesn't change refcount on adding/removing element
+ *
+ * - support long name hash table (for param-tree)
+ *
+ * - fix a bug for cfs_hash_rehash_key:
+ * in the old implementation, cfs_hash_rehash_key could corrupt the
+ * hash table because @key was overwritten without any protection.
+ * Now the user must define hs_keycpy for rehash-enabled hash tables,
+ * and cfs_hash_rehash_key overwrites the hash key under the lock
+ * by calling hs_keycpy.
+ *
+ * - better hash iteration:
+ * Now we support both locked iteration & lockless iteration of hash
+ * table. Also, the user can break the iteration by returning 1 from the callback.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/seq_file.h>
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static unsigned int warn_on_depth = 8;
+module_param(warn_on_depth, uint, 0644);
+MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high.");
+#endif
+
+struct cfs_wi_sched *cfs_sched_rehash;
+
+static inline void
+cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {}
+
+static inline void
+cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {}
+
+static inline void
+cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive)
+ __acquires(&lock->spin)
+{
+ spin_lock(&lock->spin);
+}
+
+static inline void
+cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive)
+ __releases(&lock->spin)
+{
+ spin_unlock(&lock->spin);
+}
+
+static inline void
+cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive)
+ __acquires(&lock->rw)
+{
+ if (!exclusive)
+ read_lock(&lock->rw);
+ else
+ write_lock(&lock->rw);
+}
+
+static inline void
+cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive)
+ __releases(&lock->rw)
+{
+ if (!exclusive)
+ read_unlock(&lock->rw);
+ else
+ write_unlock(&lock->rw);
+}
+
+/** No lock hash */
+static cfs_hash_lock_ops_t cfs_hash_nl_lops = {
+ .hs_lock = cfs_hash_nl_lock,
+ .hs_unlock = cfs_hash_nl_unlock,
+ .hs_bkt_lock = cfs_hash_nl_lock,
+ .hs_bkt_unlock = cfs_hash_nl_unlock,
+};
+
+/** no bucket lock, one spinlock to protect everything */
+static cfs_hash_lock_ops_t cfs_hash_nbl_lops = {
+ .hs_lock = cfs_hash_spin_lock,
+ .hs_unlock = cfs_hash_spin_unlock,
+ .hs_bkt_lock = cfs_hash_nl_lock,
+ .hs_bkt_unlock = cfs_hash_nl_unlock,
+};
+
+/** spin bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = {
+ .hs_lock = cfs_hash_rw_lock,
+ .hs_unlock = cfs_hash_rw_unlock,
+ .hs_bkt_lock = cfs_hash_spin_lock,
+ .hs_bkt_unlock = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = {
+ .hs_lock = cfs_hash_rw_lock,
+ .hs_unlock = cfs_hash_rw_unlock,
+ .hs_bkt_lock = cfs_hash_rw_lock,
+ .hs_bkt_unlock = cfs_hash_rw_unlock,
+};
+
+/** spin bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = {
+ .hs_lock = cfs_hash_nl_lock,
+ .hs_unlock = cfs_hash_nl_unlock,
+ .hs_bkt_lock = cfs_hash_spin_lock,
+ .hs_bkt_unlock = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = {
+ .hs_lock = cfs_hash_nl_lock,
+ .hs_unlock = cfs_hash_nl_unlock,
+ .hs_bkt_lock = cfs_hash_rw_lock,
+ .hs_bkt_unlock = cfs_hash_rw_unlock,
+};
+
+static void
+cfs_hash_lock_setup(struct cfs_hash *hs)
+{
+ if (cfs_hash_with_no_lock(hs)) {
+ hs->hs_lops = &cfs_hash_nl_lops;
+
+ } else if (cfs_hash_with_no_bktlock(hs)) {
+ hs->hs_lops = &cfs_hash_nbl_lops;
+ spin_lock_init(&hs->hs_lock.spin);
+
+ } else if (cfs_hash_with_rehash(hs)) {
+ rwlock_init(&hs->hs_lock.rw);
+
+ if (cfs_hash_with_rw_bktlock(hs))
+ hs->hs_lops = &cfs_hash_bkt_rw_lops;
+ else if (cfs_hash_with_spin_bktlock(hs))
+ hs->hs_lops = &cfs_hash_bkt_spin_lops;
+ else
+ LBUG();
+ } else {
+ if (cfs_hash_with_rw_bktlock(hs))
+ hs->hs_lops = &cfs_hash_nr_bkt_rw_lops;
+ else if (cfs_hash_with_spin_bktlock(hs))
+ hs->hs_lops = &cfs_hash_nr_bkt_spin_lops;
+ else
+ LBUG();
+ }
+}
+
+/**
+ * Simple hash head without depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+ struct hlist_head hh_head; /**< entries list */
+} cfs_hash_head_t;
+
+static int
+cfs_hash_hh_hhead_size(struct cfs_hash *hs)
+{
+ return sizeof(cfs_hash_head_t);
+}
+
+static struct hlist_head *
+cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+ cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0];
+
+ return &head[bd->bd_offset].hh_head;
+}
+
+static int
+cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd));
+ return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ hlist_del_init(hnode);
+ return -1; /* unknown depth */
+}
+
+/**
+ * Simple hash head with depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+ struct hlist_head hd_head; /**< entries list */
+ unsigned int hd_depth; /**< list length */
+} cfs_hash_head_dep_t;
+
+static int
+cfs_hash_hd_hhead_size(struct cfs_hash *hs)
+{
+ return sizeof(cfs_hash_head_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+ cfs_hash_head_dep_t *head;
+
+ head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0];
+ return &head[bd->bd_offset].hd_head;
+}
+
+static int
+cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+ cfs_hash_head_dep_t, hd_head);
+ hlist_add_head(hnode, &hh->hd_head);
+ return ++hh->hd_depth;
+}
+
+static int
+cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+ cfs_hash_head_dep_t, hd_head);
+ hlist_del_init(hnode);
+ return --hh->hd_depth;
+}
+
+/**
+ * double links hash head without depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+ struct hlist_head dh_head; /**< entries list */
+ struct hlist_node *dh_tail; /**< the last entry */
+} cfs_hash_dhead_t;
+
+static int
+cfs_hash_dh_hhead_size(struct cfs_hash *hs)
+{
+ return sizeof(cfs_hash_dhead_t);
+}
+
+static struct hlist_head *
+cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+ cfs_hash_dhead_t *head;
+
+ head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0];
+ return &head[bd->bd_offset].dh_head;
+}
+
+static int
+cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+ cfs_hash_dhead_t, dh_head);
+
+ if (dh->dh_tail != NULL) /* not empty */
+ hlist_add_behind(hnode, dh->dh_tail);
+ else /* empty list */
+ hlist_add_head(hnode, &dh->dh_head);
+ dh->dh_tail = hnode;
+ return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnd)
+{
+ cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+ cfs_hash_dhead_t, dh_head);
+
+ if (hnd->next == NULL) { /* it's the tail */
+ dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL :
+ container_of(hnd->pprev, struct hlist_node, next);
+ }
+ hlist_del_init(hnd);
+ return -1; /* unknown depth */
+}
+
+/**
+ * double links hash head with depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+ struct hlist_head dd_head; /**< entries list */
+ struct hlist_node *dd_tail; /**< the last entry */
+ unsigned int dd_depth; /**< list length */
+} cfs_hash_dhead_dep_t;
+
+static int
+cfs_hash_dd_hhead_size(struct cfs_hash *hs)
+{
+ return sizeof(cfs_hash_dhead_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd)
+{
+ cfs_hash_dhead_dep_t *head;
+
+ head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0];
+ return &head[bd->bd_offset].dd_head;
+}
+
+static int
+cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+ cfs_hash_dhead_dep_t, dd_head);
+
+ if (dh->dd_tail != NULL) /* not empty */
+ hlist_add_behind(hnode, dh->dd_tail);
+ else /* empty list */
+ hlist_add_head(hnode, &dh->dd_head);
+ dh->dd_tail = hnode;
+ return ++dh->dd_depth;
+}
+
+static int
+cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnd)
+{
+ cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+ cfs_hash_dhead_dep_t, dd_head);
+
+ if (hnd->next == NULL) { /* it's the tail */
+ dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL :
+ container_of(hnd->pprev, struct hlist_node, next);
+ }
+ hlist_del_init(hnd);
+ return --dh->dd_depth;
+}
+
+static cfs_hash_hlist_ops_t cfs_hash_hh_hops = {
+ .hop_hhead = cfs_hash_hh_hhead,
+ .hop_hhead_size = cfs_hash_hh_hhead_size,
+ .hop_hnode_add = cfs_hash_hh_hnode_add,
+ .hop_hnode_del = cfs_hash_hh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_hd_hops = {
+ .hop_hhead = cfs_hash_hd_hhead,
+ .hop_hhead_size = cfs_hash_hd_hhead_size,
+ .hop_hnode_add = cfs_hash_hd_hnode_add,
+ .hop_hnode_del = cfs_hash_hd_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dh_hops = {
+ .hop_hhead = cfs_hash_dh_hhead,
+ .hop_hhead_size = cfs_hash_dh_hhead_size,
+ .hop_hnode_add = cfs_hash_dh_hnode_add,
+ .hop_hnode_del = cfs_hash_dh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dd_hops = {
+ .hop_hhead = cfs_hash_dd_hhead,
+ .hop_hhead_size = cfs_hash_dd_hhead_size,
+ .hop_hnode_add = cfs_hash_dd_hnode_add,
+ .hop_hnode_del = cfs_hash_dd_hnode_del,
+};
+
+static void
+cfs_hash_hlist_setup(struct cfs_hash *hs)
+{
+ if (cfs_hash_with_add_tail(hs)) {
+ hs->hs_hops = cfs_hash_with_depth(hs) ?
+ &cfs_hash_dd_hops : &cfs_hash_dh_hops;
+ } else {
+ hs->hs_hops = cfs_hash_with_depth(hs) ?
+ &cfs_hash_hd_hops : &cfs_hash_hh_hops;
+ }
+}
+
+static void
+cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts,
+ unsigned int bits, const void *key, struct cfs_hash_bd *bd)
+{
+ unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1);
+
+ LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits);
+
+ bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)];
+ bd->bd_offset = index >> (bits - hs->hs_bkt_bits);
+}
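+
+/*
+ * Worked example (illustrative): with bits = 10 and hs_bkt_bits = 3
+ * there are 1 << (10 - 3) = 128 buckets of 1 << 3 = 8 hlist heads each.
+ * For index = 421 (0x1a5) the code above selects
+ * bd_bucket = bkts[421 & 127] = bkts[37] and bd_offset = 421 >> 7 = 3.
+ */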
+
+void
+cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd)
+{
+ /* NB: caller should hold hs->hs_lock.rw if REHASH is set */
+ if (likely(hs->hs_rehash_buckets == NULL)) {
+ cfs_hash_bd_from_key(hs, hs->hs_buckets,
+ hs->hs_cur_bits, key, bd);
+ } else {
+ LASSERT(hs->hs_rehash_bits != 0);
+ cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+ hs->hs_rehash_bits, key, bd);
+ }
+}
+EXPORT_SYMBOL(cfs_hash_bd_get);
+
+static inline void
+cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur)
+{
+ if (likely(dep_cur <= bd->bd_bucket->hsb_depmax))
+ return;
+
+ bd->bd_bucket->hsb_depmax = dep_cur;
+# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+ if (likely(warn_on_depth == 0 ||
+ max(warn_on_depth, hs->hs_dep_max) >= dep_cur))
+ return;
+
+ spin_lock(&hs->hs_dep_lock);
+ hs->hs_dep_max = dep_cur;
+ hs->hs_dep_bkt = bd->bd_bucket->hsb_index;
+ hs->hs_dep_off = bd->bd_offset;
+ hs->hs_dep_bits = hs->hs_cur_bits;
+ spin_unlock(&hs->hs_dep_lock);
+
+ cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi);
+# endif
+}
+
+void
+cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ int rc;
+
+ rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode);
+ cfs_hash_bd_dep_record(hs, bd, rc);
+ bd->bd_bucket->hsb_version++;
+ if (unlikely(bd->bd_bucket->hsb_version == 0))
+ bd->bd_bucket->hsb_version++;
+ bd->bd_bucket->hsb_count++;
+
+ if (cfs_hash_with_counter(hs))
+ atomic_inc(&hs->hs_count);
+ if (!cfs_hash_with_no_itemref(hs))
+ cfs_hash_get(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_add_locked);
+
+void
+cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode)
+{
+ hs->hs_hops->hop_hnode_del(hs, bd, hnode);
+
+ LASSERT(bd->bd_bucket->hsb_count > 0);
+ bd->bd_bucket->hsb_count--;
+ bd->bd_bucket->hsb_version++;
+ if (unlikely(bd->bd_bucket->hsb_version == 0))
+ bd->bd_bucket->hsb_version++;
+
+ if (cfs_hash_with_counter(hs)) {
+ LASSERT(atomic_read(&hs->hs_count) > 0);
+ atomic_dec(&hs->hs_count);
+ }
+ if (!cfs_hash_with_no_itemref(hs))
+ cfs_hash_put_locked(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_del_locked);
+
+void
+cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old,
+ struct cfs_hash_bd *bd_new, struct hlist_node *hnode)
+{
+ struct cfs_hash_bucket *obkt = bd_old->bd_bucket;
+ struct cfs_hash_bucket *nbkt = bd_new->bd_bucket;
+ int rc;
+
+ if (cfs_hash_bd_compare(bd_old, bd_new) == 0)
+ return;
+
+ /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops
+ * in cfs_hash_bd_del/add_locked */
+ hs->hs_hops->hop_hnode_del(hs, bd_old, hnode);
+ rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode);
+ cfs_hash_bd_dep_record(hs, bd_new, rc);
+
+ LASSERT(obkt->hsb_count > 0);
+ obkt->hsb_count--;
+ obkt->hsb_version++;
+ if (unlikely(obkt->hsb_version == 0))
+ obkt->hsb_version++;
+ nbkt->hsb_count++;
+ nbkt->hsb_version++;
+ if (unlikely(nbkt->hsb_version == 0))
+ nbkt->hsb_version++;
+}
+EXPORT_SYMBOL(cfs_hash_bd_move_locked);
+
+enum {
+ /** always set, for sanity (avoid ZERO intent) */
+ CFS_HS_LOOKUP_MASK_FIND = 1 << 0,
+ /** return entry with a ref */
+ CFS_HS_LOOKUP_MASK_REF = 1 << 1,
+ /** add entry if not existing */
+ CFS_HS_LOOKUP_MASK_ADD = 1 << 2,
+ /** delete entry, ignore other masks */
+ CFS_HS_LOOKUP_MASK_DEL = 1 << 3,
+};
+
+typedef enum cfs_hash_lookup_intent {
+ /** return item w/o refcount */
+ CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND,
+ /** return item with refcount */
+ CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND |
+ CFS_HS_LOOKUP_MASK_REF),
+ /** return item w/o refcount if existed, otherwise add */
+ CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND |
+ CFS_HS_LOOKUP_MASK_ADD),
+ /** return item with refcount if existed, otherwise add */
+ CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND |
+ CFS_HS_LOOKUP_MASK_ADD),
+ /** delete if existed */
+ CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND |
+ CFS_HS_LOOKUP_MASK_DEL)
+} cfs_hash_lookup_intent_t;
+
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ const void *key, struct hlist_node *hnode,
+ cfs_hash_lookup_intent_t intent)
+
+{
+ struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd);
+ struct hlist_node *ehnode;
+ struct hlist_node *match;
+ int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+ /* with this function, we can avoid a lot of useless refcount ops,
+ * which are expensive atomic operations most of the time. */
+ match = intent_add ? NULL : hnode;
+ hlist_for_each(ehnode, hhead) {
+ if (!cfs_hash_keycmp(hs, key, ehnode))
+ continue;
+
+ if (match != NULL && match != ehnode) /* can't match */
+ continue;
+
+ /* match and ... */
+ if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) {
+ cfs_hash_bd_del_locked(hs, bd, ehnode);
+ return ehnode;
+ }
+
+ /* caller wants refcount? */
+ if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+ cfs_hash_get(hs, ehnode);
+ return ehnode;
+ }
+ /* no match item */
+ if (!intent_add)
+ return NULL;
+
+ LASSERT(hnode != NULL);
+ cfs_hash_bd_add_locked(hs, bd, hnode);
+ return hnode;
+}
+
+struct hlist_node *
+cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key)
+{
+ return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+ CFS_HS_LOOKUP_IT_FIND);
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, const void *key)
+{
+ return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+ CFS_HS_LOOKUP_IT_PEEK);
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+struct hlist_node *
+cfs_hash_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ const void *key, struct hlist_node *hnode,
+ int noref)
+{
+ return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+ CFS_HS_LOOKUP_IT_ADD |
+ (!noref * CFS_HS_LOOKUP_MASK_REF));
+}
+EXPORT_SYMBOL(cfs_hash_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ const void *key, struct hlist_node *hnode)
+{
+ /* hnode can be NULL; if so, we find the first item with @key */
+ return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+ CFS_HS_LOOKUP_IT_FINDDEL);
+}
+EXPORT_SYMBOL(cfs_hash_bd_finddel_locked);
+
+static void
+cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ unsigned n, int excl)
+{
+ struct cfs_hash_bucket *prev = NULL;
+ int i;
+
+ /**
+ * bds must be in ascending order of bd->bd_bucket->hsb_index.
+ * NB: it's possible that several bds point to the same bucket but
+ * have different bd::bd_offset, so take care to avoid deadlock.
+ */
+ cfs_hash_for_each_bd(bds, n, i) {
+ if (prev == bds[i].bd_bucket)
+ continue;
+
+ LASSERT(prev == NULL ||
+ prev->hsb_index < bds[i].bd_bucket->hsb_index);
+ cfs_hash_bd_lock(hs, &bds[i], excl);
+ prev = bds[i].bd_bucket;
+ }
+}
+
+static void
+cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ unsigned n, int excl)
+{
+ struct cfs_hash_bucket *prev = NULL;
+ int i;
+
+ cfs_hash_for_each_bd(bds, n, i) {
+ if (prev != bds[i].bd_bucket) {
+ cfs_hash_bd_unlock(hs, &bds[i], excl);
+ prev = bds[i].bd_bucket;
+ }
+ }
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ unsigned n, const void *key)
+{
+ struct hlist_node *ehnode;
+ unsigned i;
+
+ cfs_hash_for_each_bd(bds, n, i) {
+ ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL,
+ CFS_HS_LOOKUP_IT_FIND);
+ if (ehnode != NULL)
+ return ehnode;
+ }
+ return NULL;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs,
+ struct cfs_hash_bd *bds, unsigned n, const void *key,
+ struct hlist_node *hnode, int noref)
+{
+ struct hlist_node *ehnode;
+ int intent;
+ unsigned i;
+
+ LASSERT(hnode != NULL);
+ intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF);
+
+ cfs_hash_for_each_bd(bds, n, i) {
+ ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key,
+ NULL, intent);
+ if (ehnode != NULL)
+ return ehnode;
+ }
+
+ if (i == 1) { /* only one bucket */
+ cfs_hash_bd_add_locked(hs, &bds[0], hnode);
+ } else {
+ struct cfs_hash_bd mybd;
+
+ cfs_hash_bd_get(hs, key, &mybd);
+ cfs_hash_bd_add_locked(hs, &mybd, hnode);
+ }
+
+ return hnode;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ unsigned n, const void *key,
+ struct hlist_node *hnode)
+{
+ struct hlist_node *ehnode;
+ unsigned i;
+
+ cfs_hash_for_each_bd(bds, n, i) {
+ ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode,
+ CFS_HS_LOOKUP_IT_FINDDEL);
+ if (ehnode != NULL)
+ return ehnode;
+ }
+ return NULL;
+}
+
+static void
+cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2)
+{
+ int rc;
+
+ if (bd2->bd_bucket == NULL)
+ return;
+
+ if (bd1->bd_bucket == NULL) {
+ *bd1 = *bd2;
+ bd2->bd_bucket = NULL;
+ return;
+ }
+
+ rc = cfs_hash_bd_compare(bd1, bd2);
+ if (rc == 0) {
+ bd2->bd_bucket = NULL;
+
+ } else if (rc > 0) { /* swap bd1 and bd2 */
+ struct cfs_hash_bd tmp;
+
+ tmp = *bd2;
+ *bd2 = *bd1;
+ *bd1 = tmp;
+ }
+}
+
+void
+cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bds)
+{
+ /* NB: caller should hold hs_lock.rw if REHASH is set */
+ cfs_hash_bd_from_key(hs, hs->hs_buckets,
+ hs->hs_cur_bits, key, &bds[0]);
+ if (likely(hs->hs_rehash_buckets == NULL)) {
+ /* no rehash or not rehashing */
+ bds[1].bd_bucket = NULL;
+ return;
+ }
+
+ LASSERT(hs->hs_rehash_bits != 0);
+ cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+ hs->hs_rehash_bits, key, &bds[1]);
+
+ cfs_hash_bd_order(&bds[0], &bds[1]);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_get);
+
+void
+cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl)
+{
+ cfs_hash_multi_bd_lock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lock);
+
+void
+cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl)
+{
+ cfs_hash_multi_bd_unlock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_unlock);
+
+struct hlist_node *
+cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ const void *key)
+{
+ return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ const void *key, struct hlist_node *hnode,
+ int noref)
+{
+ return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
+ hnode, noref);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
+ const void *key, struct hlist_node *hnode)
+{
+ return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked);
+
+static void
+cfs_hash_buckets_free(struct cfs_hash_bucket **buckets,
+ int bkt_size, int prev_size, int size)
+{
+ int i;
+
+ for (i = prev_size; i < size; i++) {
+ if (buckets[i] != NULL)
+ LIBCFS_FREE(buckets[i], bkt_size);
+ }
+
+ LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_buckets if no allocation was
+ * needed, the newly allocated buckets if allocation was needed and
+ * successful, and NULL on error.
+ */
+static struct cfs_hash_bucket **
+cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts,
+ unsigned int old_size, unsigned int new_size)
+{
+ struct cfs_hash_bucket **new_bkts;
+ int i;
+
+ LASSERT(old_size == 0 || old_bkts != NULL);
+
+ if (old_bkts != NULL && old_size == new_size)
+ return old_bkts;
+
+ LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+ if (new_bkts == NULL)
+ return NULL;
+
+ if (old_bkts != NULL) {
+ memcpy(new_bkts, old_bkts,
+ min(old_size, new_size) * sizeof(*old_bkts));
+ }
+
+ for (i = old_size; i < new_size; i++) {
+ struct hlist_head *hhead;
+ struct cfs_hash_bd bd;
+
+ LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+ if (new_bkts[i] == NULL) {
+ cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+ old_size, new_size);
+ return NULL;
+ }
+
+ new_bkts[i]->hsb_index = i;
+ new_bkts[i]->hsb_version = 1; /* shouldn't be zero */
+ new_bkts[i]->hsb_depmax = -1; /* unknown */
+ bd.bd_bucket = new_bkts[i];
+ cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+ INIT_HLIST_HEAD(hhead);
+
+ if (cfs_hash_with_no_lock(hs) ||
+ cfs_hash_with_no_bktlock(hs))
+ continue;
+
+ if (cfs_hash_with_rw_bktlock(hs))
+ rwlock_init(&new_bkts[i]->hsb_lock.rw);
+ else if (cfs_hash_with_spin_bktlock(hs))
+ spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+ else
+ LBUG(); /* invalid use-case */
+ }
+ return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash, where:
+ * @name - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops - Registered hash table operations
+ * @flags - CFS_HASH_REHASH enable dynamic hash resizing
+ * - CFS_HASH_SORT enable chained hash sort
+ */
+static int cfs_hash_rehash_worker(cfs_workitem_t *wi);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static int cfs_hash_dep_print(cfs_workitem_t *wi)
+{
+ struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_dep_wi);
+ int dep;
+ int bkt;
+ int off;
+ int bits;
+
+ spin_lock(&hs->hs_dep_lock);
+ dep = hs->hs_dep_max;
+ bkt = hs->hs_dep_bkt;
+ off = hs->hs_dep_off;
+ bits = hs->hs_dep_bits;
+ spin_unlock(&hs->hs_dep_lock);
+
+ LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+ hs->hs_name, bits, dep, bkt, off);
+ spin_lock(&hs->hs_dep_lock);
+ hs->hs_dep_bits = 0; /* mark as workitem done */
+ spin_unlock(&hs->hs_dep_lock);
+ return 0;
+}
+
+static void cfs_hash_depth_wi_init(struct cfs_hash *hs)
+{
+ spin_lock_init(&hs->hs_dep_lock);
+ cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print);
+}
+
+static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs)
+{
+ if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi))
+ return;
+
+ spin_lock(&hs->hs_dep_lock);
+ while (hs->hs_dep_bits != 0) {
+ spin_unlock(&hs->hs_dep_lock);
+ cond_resched();
+ spin_lock(&hs->hs_dep_lock);
+ }
+ spin_unlock(&hs->hs_dep_lock);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
+
+static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {}
+static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */
+
+struct cfs_hash *
+cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+ unsigned bkt_bits, unsigned extra_bytes,
+ unsigned min_theta, unsigned max_theta,
+ cfs_hash_ops_t *ops, unsigned flags)
+{
+ struct cfs_hash *hs;
+ int len;
+
+ CLASSERT(CFS_HASH_THETA_BITS < 15);
+
+ LASSERT(name != NULL);
+ LASSERT(ops != NULL);
+ LASSERT(ops->hs_key);
+ LASSERT(ops->hs_hash);
+ LASSERT(ops->hs_object);
+ LASSERT(ops->hs_keycmp);
+ LASSERT(ops->hs_get != NULL);
+ LASSERT(ops->hs_put_locked != NULL);
+
+ if ((flags & CFS_HASH_REHASH) != 0)
+ flags |= CFS_HASH_COUNTER; /* must have counter */
+
+ LASSERT(cur_bits > 0);
+ LASSERT(cur_bits >= bkt_bits);
+ LASSERT(max_bits >= cur_bits && max_bits < 31);
+ LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits));
+ LASSERT(ergo((flags & CFS_HASH_REHASH) != 0,
+ (flags & CFS_HASH_NO_LOCK) == 0));
+ LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0,
+ ops->hs_keycpy != NULL));
+
+ len = (flags & CFS_HASH_BIGNAME) == 0 ?
+ CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN;
+ LIBCFS_ALLOC(hs, offsetof(struct cfs_hash, hs_name[len]));
+ if (hs == NULL)
+ return NULL;
+
+ strncpy(hs->hs_name, name, len);
+ hs->hs_name[len - 1] = '\0';
+ hs->hs_flags = flags;
+
+ atomic_set(&hs->hs_refcount, 1);
+ atomic_set(&hs->hs_count, 0);
+
+ cfs_hash_lock_setup(hs);
+ cfs_hash_hlist_setup(hs);
+
+ hs->hs_cur_bits = (__u8)cur_bits;
+ hs->hs_min_bits = (__u8)cur_bits;
+ hs->hs_max_bits = (__u8)max_bits;
+ hs->hs_bkt_bits = (__u8)bkt_bits;
+
+ hs->hs_ops = ops;
+ hs->hs_extra_bytes = extra_bytes;
+ hs->hs_rehash_bits = 0;
+ cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker);
+ cfs_hash_depth_wi_init(hs);
+
+ if (cfs_hash_with_rehash(hs))
+ __cfs_hash_set_theta(hs, min_theta, max_theta);
+
+ hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0,
+ CFS_HASH_NBKT(hs));
+ if (hs->hs_buckets != NULL)
+ return hs;
+
+ LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[len]));
+ return NULL;
+}
+EXPORT_SYMBOL(cfs_hash_create);
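+
+/*
+ * Example usage (illustrative sketch only, not part of the original code;
+ * "my_hash_ops" stands for a hypothetical caller-provided cfs_hash_ops_t
+ * supplying the mandatory hs_key/hs_hash/hs_object/hs_keycmp/hs_get/
+ * hs_put_locked callbacks, and the theta macros are assumed to be the
+ * defaults from libcfs_hash.h):
+ *
+ *	struct cfs_hash *hs;
+ *
+ *	hs = cfs_hash_create("my_hash", 7, 12, 3, 0,
+ *			     CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+ *			     &my_hash_ops, CFS_HASH_REHASH);
+ *	if (hs == NULL)
+ *		return -ENOMEM;
+ *	...
+ *	cfs_hash_putref(hs);
+ *
+ * Dropping the last reference with cfs_hash_putref() destroys the hash.
+ */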
+
+/**
+ * Cleanup libcfs hash @hs.
+ */
+static void
+cfs_hash_destroy(struct cfs_hash *hs)
+{
+ struct hlist_node *hnode;
+ struct hlist_node *pos;
+ struct cfs_hash_bd bd;
+ int i;
+
+ LASSERT(hs != NULL);
+ LASSERT(!cfs_hash_is_exiting(hs) &&
+ !cfs_hash_is_iterating(hs));
+
+ /*
+ * prohibit further rehashes; no lock is needed because
+ * we are the last user and nobody else can change it.
+ */
+ hs->hs_exiting = 1;
+ if (cfs_hash_with_rehash(hs))
+ cfs_hash_rehash_cancel(hs);
+
+ cfs_hash_depth_wi_cancel(hs);
+ /* rehash should be done/canceled */
+ LASSERT(hs->hs_buckets != NULL &&
+ hs->hs_rehash_buckets == NULL);
+
+ cfs_hash_for_each_bucket(hs, &bd, i) {
+ struct hlist_head *hhead;
+
+ LASSERT(bd.bd_bucket != NULL);
+ /* no need to take this lock, just for consistent code */
+ cfs_hash_bd_lock(hs, &bd, 1);
+
+ cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+ hlist_for_each_safe(hnode, pos, hhead) {
+ LASSERTF(!cfs_hash_with_assert_empty(hs),
+ "hash %s bucket %u(%u) is not empty: %u items left\n",
+ hs->hs_name, bd.bd_bucket->hsb_index,
+ bd.bd_offset, bd.bd_bucket->hsb_count);
+ /* can't assert key validity, because we
+ * may have interrupted a rehash */
+ cfs_hash_bd_del_locked(hs, &bd, hnode);
+ cfs_hash_exit(hs, hnode);
+ }
+ }
+ LASSERT(bd.bd_bucket->hsb_count == 0);
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ cond_resched();
+ }
+
+ LASSERT(atomic_read(&hs->hs_count) == 0);
+
+ cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs),
+ 0, CFS_HASH_NBKT(hs));
+ i = cfs_hash_with_bigname(hs) ?
+ CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN;
+ LIBCFS_FREE(hs, offsetof(struct cfs_hash, hs_name[i]));
+}
+
+struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs)
+{
+ if (atomic_inc_not_zero(&hs->hs_refcount))
+ return hs;
+ return NULL;
+}
+EXPORT_SYMBOL(cfs_hash_getref);
+
+void cfs_hash_putref(struct cfs_hash *hs)
+{
+ if (atomic_dec_and_test(&hs->hs_refcount))
+ cfs_hash_destroy(hs);
+}
+EXPORT_SYMBOL(cfs_hash_putref);
+
+static inline int
+cfs_hash_rehash_bits(struct cfs_hash *hs)
+{
+ if (cfs_hash_with_no_lock(hs) ||
+ !cfs_hash_with_rehash(hs))
+ return -EOPNOTSUPP;
+
+ if (unlikely(cfs_hash_is_exiting(hs)))
+ return -ESRCH;
+
+ if (unlikely(cfs_hash_is_rehashing(hs)))
+ return -EALREADY;
+
+ if (unlikely(cfs_hash_is_iterating(hs)))
+ return -EAGAIN;
+
+ /* XXX: need to handle case with max_theta != 2.0
+ * and the case with min_theta != 0.5 */
+ if ((hs->hs_cur_bits < hs->hs_max_bits) &&
+ (__cfs_hash_theta(hs) > hs->hs_max_theta))
+ return hs->hs_cur_bits + 1;
+
+ if (!cfs_hash_with_shrink(hs))
+ return 0;
+
+ if ((hs->hs_cur_bits > hs->hs_min_bits) &&
+ (__cfs_hash_theta(hs) < hs->hs_min_theta))
+ return hs->hs_cur_bits - 1;
+
+ return 0;
+}
+
+/**
+ * don't allow inline rehash if:
+ * - user wants non-blocking change (add/del) on hash table
+ * - too many elements
+ */
+static inline int
+cfs_hash_rehash_inline(struct cfs_hash *hs)
+{
+ return !cfs_hash_with_nblk_change(hs) &&
+ atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key. The registered
+ * ops->hs_get function will be called when the item is added.
+ */
+void
+cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
+{
+ struct cfs_hash_bd bd;
+ int bits;
+
+ LASSERT(hlist_unhashed(hnode));
+
+ cfs_hash_lock(hs, 0);
+ cfs_hash_bd_get_and_lock(hs, key, &bd, 1);
+
+ cfs_hash_key_validate(hs, key, hnode);
+ cfs_hash_bd_add_locked(hs, &bd, hnode);
+
+ cfs_hash_bd_unlock(hs, &bd, 1);
+
+ bits = cfs_hash_rehash_bits(hs);
+ cfs_hash_unlock(hs, 0);
+ if (bits > 0)
+ cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+}
+EXPORT_SYMBOL(cfs_hash_add);
+
+static struct hlist_node *
+cfs_hash_find_or_add(struct cfs_hash *hs, const void *key,
+ struct hlist_node *hnode, int noref)
+{
+ struct hlist_node *ehnode;
+ struct cfs_hash_bd bds[2];
+ int bits = 0;
+
+ LASSERT(hlist_unhashed(hnode));
+
+ cfs_hash_lock(hs, 0);
+ cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+ cfs_hash_key_validate(hs, key, hnode);
+ ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key,
+ hnode, noref);
+ cfs_hash_dual_bd_unlock(hs, bds, 1);
+
+ if (ehnode == hnode) /* new item added */
+ bits = cfs_hash_rehash_bits(hs);
+ cfs_hash_unlock(hs, 0);
+ if (bits > 0)
+ cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+ return ehnode;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key. The registered
+ * ops->hs_get function will be called if the item was added.
+ * Returns 0 on success or -EALREADY on key collisions.
+ */
+int
+cfs_hash_add_unique(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
+{
+ return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ?
+ -EALREADY : 0;
+}
+EXPORT_SYMBOL(cfs_hash_add_unique);
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key. If this @key
+ * already exists in the hash then ops->hs_get will be called on the
+ * conflicting entry and that entry will be returned to the caller.
+ * Otherwise ops->hs_get is called on the item which was added.
+ */
+void *
+cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key,
+ struct hlist_node *hnode)
+{
+ hnode = cfs_hash_find_or_add(hs, key, hnode, 0);
+
+ return cfs_hash_object(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_findadd_unique);
+
+/**
+ * Delete item @hnode from the libcfs hash @hs using @key. The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket. The object
+ * removed from the hash will be returned and obs->hs_put is called
+ * on the removed object.
+ */
+void *
+cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
+{
+ void *obj = NULL;
+ int bits = 0;
+ struct cfs_hash_bd bds[2];
+
+ cfs_hash_lock(hs, 0);
+ cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+ /* NB: do nothing if @hnode is not in hash table */
+ if (hnode == NULL || !hlist_unhashed(hnode)) {
+ if (bds[1].bd_bucket == NULL && hnode != NULL) {
+ cfs_hash_bd_del_locked(hs, &bds[0], hnode);
+ } else {
+ hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
+ key, hnode);
+ }
+ }
+
+ if (hnode != NULL) {
+ obj = cfs_hash_object(hs, hnode);
+ bits = cfs_hash_rehash_bits(hs);
+ }
+
+ cfs_hash_dual_bd_unlock(hs, bds, 1);
+ cfs_hash_unlock(hs, 0);
+ if (bits > 0)
+ cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+ return obj;
+}
+EXPORT_SYMBOL(cfs_hash_del);
+
+/**
+ * Delete item given @key in libcfs hash @hs. The first @key found in
+ * the hash will be removed, if the key exists multiple times in the hash
+ * @hs this function must be called once per key. The removed object
+ * will be returned and ops->hs_put is called on the removed object.
+ */
+void *
+cfs_hash_del_key(struct cfs_hash *hs, const void *key)
+{
+ return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Lookup an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash, hs->hs_get() is called and the
+ * matching object is returned. It is the caller's responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when finished with the object. If the @key was not found
+ * in the hash @hs NULL is returned.
+ */
+void *
+cfs_hash_lookup(struct cfs_hash *hs, const void *key)
+{
+ void *obj = NULL;
+ struct hlist_node *hnode;
+ struct cfs_hash_bd bds[2];
+
+ cfs_hash_lock(hs, 0);
+ cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+ hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+ if (hnode != NULL)
+ obj = cfs_hash_object(hs, hnode);
+
+ cfs_hash_dual_bd_unlock(hs, bds, 0);
+ cfs_hash_unlock(hs, 0);
+
+ return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
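+
+/*
+ * Example of the add/lookup/delete cycle (illustrative sketch only, not
+ * part of the original code; "struct my_obj", "o_key" and "o_hnode" are
+ * hypothetical names for a caller-defined object that embeds a
+ * struct hlist_node):
+ *
+ *	struct my_obj *found;
+ *
+ *	cfs_hash_add(hs, &obj->o_key, &obj->o_hnode);
+ *
+ *	found = cfs_hash_lookup(hs, &obj->o_key);
+ *	if (found != NULL) {
+ *		... hs_get was called on the object, use it ...
+ *		cfs_hash_put(hs, &found->o_hnode);
+ *	}
+ *
+ *	cfs_hash_del(hs, &obj->o_key, &obj->o_hnode);
+ */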
+
+static void
+cfs_hash_for_each_enter(struct cfs_hash *hs) {
+ LASSERT(!cfs_hash_is_exiting(hs));
+
+ if (!cfs_hash_with_rehash(hs))
+ return;
+ /*
+ * NB: there is a race on struct cfs_hash::hs_iterating, but it doesn't matter
+ * because it's just an unreliable signal to rehash-thread,
+ * rehash-thread will try to finish rehash ASAP when seeing this.
+ */
+ hs->hs_iterating = 1;
+
+ cfs_hash_lock(hs, 1);
+ hs->hs_iterators++;
+
+ /* NB: iteration is mostly called by service threads, so we
+ * tend to cancel any pending rehash request instead of
+ * blocking the service thread; the rehash request will be
+ * relaunched after iteration */
+ if (cfs_hash_is_rehashing(hs))
+ cfs_hash_rehash_cancel_locked(hs);
+ cfs_hash_unlock(hs, 1);
+}
+
+static void
+cfs_hash_for_each_exit(struct cfs_hash *hs) {
+ int remained;
+ int bits;
+
+ if (!cfs_hash_with_rehash(hs))
+ return;
+ cfs_hash_lock(hs, 1);
+ remained = --hs->hs_iterators;
+ bits = cfs_hash_rehash_bits(hs);
+ cfs_hash_unlock(hs, 1);
+ /* NB: there is a race on struct cfs_hash::hs_iterating, see above */
+ if (remained == 0)
+ hs->hs_iterating = 0;
+ if (bits > 0) {
+ cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+ CFS_HASH_LOOP_HOG);
+ }
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function may sleep!
+ * b) during the callback:
+ * . the bucket lock is held so the callback must never sleep.
+ * . if @remove_safe is true, the user can remove the current item by
+ * calling cfs_hash_bd_del_locked
+ */
+static __u64
+cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func,
+ void *data, int remove_safe) {
+ struct hlist_node *hnode;
+ struct hlist_node *pos;
+ struct cfs_hash_bd bd;
+ __u64 count = 0;
+ int excl = !!remove_safe;
+ int loop = 0;
+ int i;
+
+ cfs_hash_for_each_enter(hs);
+
+ cfs_hash_lock(hs, 0);
+ LASSERT(!cfs_hash_is_rehashing(hs));
+
+ cfs_hash_for_each_bucket(hs, &bd, i) {
+ struct hlist_head *hhead;
+
+ cfs_hash_bd_lock(hs, &bd, excl);
+ if (func == NULL) { /* only glimpse size */
+ count += bd.bd_bucket->hsb_count;
+ cfs_hash_bd_unlock(hs, &bd, excl);
+ continue;
+ }
+
+ cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+ hlist_for_each_safe(hnode, pos, hhead) {
+ cfs_hash_bucket_validate(hs, &bd, hnode);
+ count++;
+ loop++;
+ if (func(hs, &bd, hnode, data)) {
+ cfs_hash_bd_unlock(hs, &bd, excl);
+ goto out;
+ }
+ }
+ }
+ cfs_hash_bd_unlock(hs, &bd, excl);
+ if (loop < CFS_HASH_LOOP_HOG)
+ continue;
+ loop = 0;
+ cfs_hash_unlock(hs, 0);
+ cond_resched();
+ cfs_hash_lock(hs, 0);
+ }
+ out:
+ cfs_hash_unlock(hs, 0);
+
+ cfs_hash_for_each_exit(hs);
+ return count;
+}
+
+typedef struct {
+ cfs_hash_cond_opt_cb_t func;
+ void *arg;
+} cfs_hash_cond_arg_t;
+
+static int
+cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
+{
+ cfs_hash_cond_arg_t *cond = data;
+
+ if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
+ cfs_hash_bd_del_locked(hs, bd, hnode);
+ return 0;
+}
+
+/**
+ * Delete items from the libcfs hash @hs for which @func returns true.
+ * The write lock is held while looping over each bucket, so no object
+ * can gain a new reference during the removal.
+ */
+void
+cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+ cfs_hash_cond_arg_t arg = {
+ .func = func,
+ .arg = data,
+ };
+
+ cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
+
+void
+cfs_hash_for_each(struct cfs_hash *hs,
+ cfs_hash_for_each_cb_t func, void *data)
+{
+ cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
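+
+/*
+ * Example iteration callback (illustrative sketch only, not part of the
+ * original code). The callback matches cfs_hash_for_each_cb_t: it runs
+ * under the bucket lock, so it must not sleep, and returning non-zero
+ * stops the iteration early.
+ *
+ *	static int my_count_cb(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ *			       struct hlist_node *hnode, void *data)
+ *	{
+ *		(*(int *)data)++;
+ *		return 0;
+ *	}
+ *
+ *	int n = 0;
+ *	cfs_hash_for_each(hs, my_count_cb, &n);
+ */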
+
+void
+cfs_hash_for_each_safe(struct cfs_hash *hs,
+ cfs_hash_for_each_cb_t func, void *data) {
+ cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
+
+static int
+cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
+{
+ *(int *)data = 0;
+ return 1; /* return 1 to break the loop */
+}
+
+int
+cfs_hash_is_empty(struct cfs_hash *hs)
+{
+ int empty = 1;
+
+ cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0);
+ return empty;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+__u64
+cfs_hash_size_get(struct cfs_hash *hs)
+{
+ return cfs_hash_with_counter(hs) ?
+ atomic_read(&hs->hs_count) :
+ cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without
+ * any lock. This function can't guarantee to finish iteration
+ * if these features are enabled:
+ *
+ * a. if rehash_key is enabled, an item can be moved from
+ * one bucket to another bucket
+ * b. user can remove non-zero-ref item from hash-table,
+ * so the item can be removed from hash-table, even worse,
+ * it's possible that user changed key and insert to another
+ * hash bucket.
+ * there's no way for us to finish iteration correctly on previous
+ * two cases, so iteration has to be stopped on change.
+ */
+static int
+cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func,
+ void *data) {
+ struct hlist_node *hnode;
+ struct hlist_node *tmp;
+ struct cfs_hash_bd bd;
+ __u32 version;
+ int count = 0;
+ int stop_on_change;
+ int rc;
+ int i;
+
+ stop_on_change = cfs_hash_with_rehash_key(hs) ||
+ !cfs_hash_with_no_itemref(hs) ||
+ hs->hs_ops->hs_put_locked == NULL;
+ cfs_hash_lock(hs, 0);
+ LASSERT(!cfs_hash_is_rehashing(hs));
+
+ cfs_hash_for_each_bucket(hs, &bd, i) {
+ struct hlist_head *hhead;
+
+ cfs_hash_bd_lock(hs, &bd, 0);
+ version = cfs_hash_bd_version_get(&bd);
+
+ cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+ for (hnode = hhead->first; hnode != NULL;) {
+ cfs_hash_bucket_validate(hs, &bd, hnode);
+ cfs_hash_get(hs, hnode);
+ cfs_hash_bd_unlock(hs, &bd, 0);
+ cfs_hash_unlock(hs, 0);
+
+ rc = func(hs, &bd, hnode, data);
+ if (stop_on_change)
+ cfs_hash_put(hs, hnode);
+ cond_resched();
+ count++;
+
+ cfs_hash_lock(hs, 0);
+ cfs_hash_bd_lock(hs, &bd, 0);
+ if (!stop_on_change) {
+ tmp = hnode->next;
+ cfs_hash_put_locked(hs, hnode);
+ hnode = tmp;
+ } else { /* bucket changed? */
+ if (version !=
+ cfs_hash_bd_version_get(&bd))
+ break;
+ /* safe to continue because no change */
+ hnode = hnode->next;
+ }
+ if (rc) /* callback wants to break iteration */
+ break;
+ }
+ }
+ cfs_hash_bd_unlock(hs, &bd, 0);
+ }
+ cfs_hash_unlock(hs, 0);
+
+ return count;
+}
+
+int
+cfs_hash_for_each_nolock(struct cfs_hash *hs,
+ cfs_hash_for_each_cb_t func, void *data) {
+ if (cfs_hash_with_no_lock(hs) ||
+ cfs_hash_with_rehash_key(hs) ||
+ !cfs_hash_with_no_itemref(hs))
+ return -EOPNOTSUPP;
+
+ if (hs->hs_ops->hs_get == NULL ||
+ (hs->hs_ops->hs_put == NULL &&
+ hs->hs_ops->hs_put_locked == NULL))
+ return -EOPNOTSUPP;
+
+ cfs_hash_for_each_enter(hs);
+ cfs_hash_for_each_relax(hs, func, data);
+ cfs_hash_for_each_exit(hs);
+
+ return 0;
+}
+EXPORT_SYMBOL(cfs_hash_for_each_nolock);
+
+/**
+ * For each hash bucket in the libcfs hash @hs call the passed callback
+ * @func until all the hash buckets are empty. The passed callback @func
+ * or the previously registered callback hs->hs_put must remove the item
+ * from the hash. You may either use the cfs_hash_del() or hlist_del()
+ * functions. No rwlocks are held during the callback @func, so it is
+ * safe to sleep if needed. This function will not terminate until the
+ * hash is empty. Note it is still possible to concurrently add new
+ * items into the hash. It is the caller's responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ */
+int
+cfs_hash_for_each_empty(struct cfs_hash *hs,
+ cfs_hash_for_each_cb_t func, void *data) {
+ unsigned i = 0;
+
+ if (cfs_hash_with_no_lock(hs))
+ return -EOPNOTSUPP;
+
+ if (hs->hs_ops->hs_get == NULL ||
+ (hs->hs_ops->hs_put == NULL &&
+ hs->hs_ops->hs_put_locked == NULL))
+ return -EOPNOTSUPP;
+
+ cfs_hash_for_each_enter(hs);
+ while (cfs_hash_for_each_relax(hs, func, data)) {
+ CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n",
+ hs->hs_name, i++);
+ }
+ cfs_hash_for_each_exit(hs);
+ return 0;
+}
+EXPORT_SYMBOL(cfs_hash_for_each_empty);
+
+void
+cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned hindex,
+ cfs_hash_for_each_cb_t func, void *data)
+{
+ struct hlist_head *hhead;
+ struct hlist_node *hnode;
+ struct cfs_hash_bd bd;
+
+ cfs_hash_for_each_enter(hs);
+ cfs_hash_lock(hs, 0);
+ if (hindex >= CFS_HASH_NHLIST(hs))
+ goto out;
+
+ cfs_hash_bd_index_set(hs, hindex, &bd);
+
+ cfs_hash_bd_lock(hs, &bd, 0);
+ hhead = cfs_hash_bd_hhead(hs, &bd);
+ hlist_for_each(hnode, hhead) {
+ if (func(hs, &bd, hnode, data))
+ break;
+ }
+ cfs_hash_bd_unlock(hs, &bd, 0);
+ out:
+ cfs_hash_unlock(hs, 0);
+ cfs_hash_for_each_exit(hs);
+}
+EXPORT_SYMBOL(cfs_hash_hlist_for_each);
+
+/*
+ * For each item in the libcfs hash @hs which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data. During the callback the bucket lock
+ * is held so the callback must never sleep.
+ */
+void
+cfs_hash_for_each_key(struct cfs_hash *hs, const void *key,
+ cfs_hash_for_each_cb_t func, void *data) {
+ struct hlist_node *hnode;
+ struct cfs_hash_bd bds[2];
+ unsigned i;
+
+ cfs_hash_lock(hs, 0);
+
+ cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+ cfs_hash_for_each_bd(bds, 2, i) {
+ struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]);
+
+ hlist_for_each(hnode, hlist) {
+ cfs_hash_bucket_validate(hs, &bds[i], hnode);
+
+ if (cfs_hash_keycmp(hs, key, hnode)) {
+ if (func(hs, &bds[i], hnode, data))
+ break;
+ }
+ }
+ }
+
+ cfs_hash_dual_bd_unlock(hs, bds, 0);
+ cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_key);
+
+/**
+ * Rehash the libcfs hash @hs to the given @bits. This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH
+ * flag is set in @hs the libcfs hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value falls outside
+ * the hs->hs_min_theta and hs->hs_max_theta bounds. By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function. The
+ * theta thresholds for @hs are tunable via cfs_hash_set_theta().
+ */
+void
+cfs_hash_rehash_cancel_locked(struct cfs_hash *hs)
+{
+ int i;
+
+ /* the caller must hold cfs_hash_lock(hs, 1) */
+ LASSERT(cfs_hash_with_rehash(hs) &&
+ !cfs_hash_with_no_lock(hs));
+
+ if (!cfs_hash_is_rehashing(hs))
+ return;
+
+ if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) {
+ hs->hs_rehash_bits = 0;
+ return;
+ }
+
+ for (i = 2; cfs_hash_is_rehashing(hs); i++) {
+ cfs_hash_unlock(hs, 1);
+ /* raise console warning while waiting too long */
+ CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO,
+ "hash %s is still rehashing, rescheded %d\n",
+ hs->hs_name, i - 1);
+ cond_resched();
+ cfs_hash_lock(hs, 1);
+ }
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked);
+
+void
+cfs_hash_rehash_cancel(struct cfs_hash *hs)
+{
+ cfs_hash_lock(hs, 1);
+ cfs_hash_rehash_cancel_locked(hs);
+ cfs_hash_unlock(hs, 1);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel);
+
+int
+cfs_hash_rehash(struct cfs_hash *hs, int do_rehash)
+{
+ int rc;
+
+ LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs));
+
+ cfs_hash_lock(hs, 1);
+
+ rc = cfs_hash_rehash_bits(hs);
+ if (rc <= 0) {
+ cfs_hash_unlock(hs, 1);
+ return rc;
+ }
+
+ hs->hs_rehash_bits = rc;
+ if (!do_rehash) {
+ /* launch and return */
+ cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi);
+ cfs_hash_unlock(hs, 1);
+ return 0;
+ }
+
+ /* rehash right now */
+ cfs_hash_unlock(hs, 1);
+
+ return cfs_hash_rehash_worker(&hs->hs_rehash_wi);
+}
+EXPORT_SYMBOL(cfs_hash_rehash);
+
+static int
+cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old)
+{
+ struct cfs_hash_bd new;
+ struct hlist_head *hhead;
+ struct hlist_node *hnode;
+ struct hlist_node *pos;
+ void *key;
+ int c = 0;
+
+ /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */
+ cfs_hash_bd_for_each_hlist(hs, old, hhead) {
+ hlist_for_each_safe(hnode, pos, hhead) {
+ key = cfs_hash_key(hs, hnode);
+ LASSERT(key != NULL);
+ /* Validate hnode is in the correct bucket. */
+ cfs_hash_bucket_validate(hs, old, hnode);
+ /*
+ * Delete from old hash bucket; move to new bucket.
+ * ops->hs_key must be defined.
+ */
+ cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+ hs->hs_rehash_bits, key, &new);
+ cfs_hash_bd_move_locked(hs, old, &new, hnode);
+ c++;
+ }
+ }
+
+ return c;
+}
+
+static int
+cfs_hash_rehash_worker(cfs_workitem_t *wi)
+{
+ struct cfs_hash *hs = container_of(wi, struct cfs_hash, hs_rehash_wi);
+ struct cfs_hash_bucket **bkts;
+ struct cfs_hash_bd bd;
+ unsigned int old_size;
+ unsigned int new_size;
+ int bsize;
+ int count = 0;
+ int rc = 0;
+ int i;
+
+ LASSERT(hs != NULL && cfs_hash_with_rehash(hs));
+
+ cfs_hash_lock(hs, 0);
+ LASSERT(cfs_hash_is_rehashing(hs));
+
+ old_size = CFS_HASH_NBKT(hs);
+ new_size = CFS_HASH_RH_NBKT(hs);
+
+ cfs_hash_unlock(hs, 0);
+
+ /*
+ * we don't need hs::hs_rwlock for hs::hs_buckets,
+ * because nobody else can change the bucket table.
+ */
+ bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets,
+ old_size, new_size);
+ cfs_hash_lock(hs, 1);
+ if (bkts == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (bkts == hs->hs_buckets) {
+ bkts = NULL; /* do nothing */
+ goto out;
+ }
+
+ rc = __cfs_hash_theta(hs);
+ if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) {
+ /* free the new allocated bkt-table */
+ old_size = new_size;
+ new_size = CFS_HASH_NBKT(hs);
+ rc = -EALREADY;
+ goto out;
+ }
+
+ LASSERT(hs->hs_rehash_buckets == NULL);
+ hs->hs_rehash_buckets = bkts;
+
+ rc = 0;
+ cfs_hash_for_each_bucket(hs, &bd, i) {
+ if (cfs_hash_is_exiting(hs)) {
+ rc = -ESRCH;
+ /* someone wants to destroy the hash, abort now */
+ if (old_size < new_size) /* OK to free old bkt-table */
+ break;
+ /* it's shrinking, need free new bkt-table */
+ hs->hs_rehash_buckets = NULL;
+ old_size = new_size;
+ new_size = CFS_HASH_NBKT(hs);
+ goto out;
+ }
+
+ count += cfs_hash_rehash_bd(hs, &bd);
+ if (count < CFS_HASH_LOOP_HOG ||
+ cfs_hash_is_iterating(hs)) { /* need to finish ASAP */
+ continue;
+ }
+
+ count = 0;
+ cfs_hash_unlock(hs, 1);
+ cond_resched();
+ cfs_hash_lock(hs, 1);
+ }
+
+ hs->hs_rehash_count++;
+
+ bkts = hs->hs_buckets;
+ hs->hs_buckets = hs->hs_rehash_buckets;
+ hs->hs_rehash_buckets = NULL;
+
+ hs->hs_cur_bits = hs->hs_rehash_bits;
+ out:
+ hs->hs_rehash_bits = 0;
+ if (rc == -ESRCH) /* never be scheduled again */
+ cfs_wi_exit(cfs_sched_rehash, wi);
+ bsize = cfs_hash_bkt_size(hs);
+ cfs_hash_unlock(hs, 1);
+ /* can't refer to @hs anymore because it could be destroyed */
+ if (bkts != NULL)
+ cfs_hash_buckets_free(bkts, bsize, new_size, old_size);
+ if (rc != 0)
+ CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc);
+ /* return 1 only if cfs_wi_exit is called */
+ return rc == -ESRCH;
+}
+
+/**
+ * Rehash the object referenced by @hnode in the libcfs hash @hs. The
+ * @old_key must be provided to locate the objects previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a cfs_hash_add() + cfs_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash. When an object is being rehashed
+ * the registered cfs_hash_get() and cfs_hash_put() functions will
+ * not be called.
+ */
+void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key,
+ void *new_key, struct hlist_node *hnode)
+{
+ struct cfs_hash_bd bds[3];
+ struct cfs_hash_bd old_bds[2];
+ struct cfs_hash_bd new_bd;
+
+ LASSERT(!hlist_unhashed(hnode));
+
+ cfs_hash_lock(hs, 0);
+
+ cfs_hash_dual_bd_get(hs, old_key, old_bds);
+ cfs_hash_bd_get(hs, new_key, &new_bd);
+
+ bds[0] = old_bds[0];
+ bds[1] = old_bds[1];
+ bds[2] = new_bd;
+
+ /* NB: bds[0] and bds[1] are ordered already */
+ cfs_hash_bd_order(&bds[1], &bds[2]);
+ cfs_hash_bd_order(&bds[0], &bds[1]);
+
+ cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+ if (likely(old_bds[1].bd_bucket == NULL)) {
+ cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+ } else {
+ cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+ cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+ }
+ /* overwrite the key while holding the locks, otherwise it may
+ * race with other operations, e.g. rehash */
+ cfs_hash_keycpy(hs, new_key, hnode);
+
+ cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+ cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
+
+void cfs_hash_debug_header(struct seq_file *m)
+{
+ seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n",
+ CFS_HASH_BIGNAME_LEN, "name");
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+static struct cfs_hash_bucket **
+cfs_hash_full_bkts(struct cfs_hash *hs)
+{
+ /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+ if (hs->hs_rehash_buckets == NULL)
+ return hs->hs_buckets;
+
+ LASSERT(hs->hs_rehash_bits != 0);
+ return hs->hs_rehash_bits > hs->hs_cur_bits ?
+ hs->hs_rehash_buckets : hs->hs_buckets;
+}
+
+static unsigned int
+cfs_hash_full_nbkt(struct cfs_hash *hs)
+{
+ /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+ if (hs->hs_rehash_buckets == NULL)
+ return CFS_HASH_NBKT(hs);
+
+ LASSERT(hs->hs_rehash_bits != 0);
+ return hs->hs_rehash_bits > hs->hs_cur_bits ?
+ CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs);
+}
+
+void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m)
+{
+ int dist[8] = { 0, };
+ int maxdep = -1;
+ int maxdepb = -1;
+ int total = 0;
+ int theta;
+ int i;
+
+ cfs_hash_lock(hs, 0);
+ theta = __cfs_hash_theta(hs);
+
+ seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ",
+ CFS_HASH_BIGNAME_LEN, hs->hs_name,
+ 1 << hs->hs_cur_bits, 1 << hs->hs_min_bits,
+ 1 << hs->hs_max_bits,
+ __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta),
+ __cfs_hash_theta_int(hs->hs_min_theta),
+ __cfs_hash_theta_frac(hs->hs_min_theta),
+ __cfs_hash_theta_int(hs->hs_max_theta),
+ __cfs_hash_theta_frac(hs->hs_max_theta),
+ hs->hs_flags, hs->hs_rehash_count);
+
+ /*
+ * The distribution is a summary of the chained hash depth in
+ * each of the libcfs hash buckets. Each bucket's hsb_count is
+ * divided by the hash theta value and used to generate a
+ * histogram of the hash distribution. A uniform hash will
+ * result in all hash buckets being close to the average thus
+ * only the first few entries in the histogram will be non-zero.
+ * If your hash function results in a non-uniform hash, this will
+ * be observable as outlier buckets in the distribution histogram.
+ *
+ * Uniform hash distribution: 128/128/0/0/0/0/0/0
+ * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1
+ */
+ for (i = 0; i < cfs_hash_full_nbkt(hs); i++) {
+ struct cfs_hash_bd bd;
+
+ bd.bd_bucket = cfs_hash_full_bkts(hs)[i];
+ cfs_hash_bd_lock(hs, &bd, 0);
+ if (maxdep < bd.bd_bucket->hsb_depmax) {
+ maxdep = bd.bd_bucket->hsb_depmax;
+ maxdepb = ffz(~maxdep);
+ }
+ total += bd.bd_bucket->hsb_count;
+ dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++;
+ cfs_hash_bd_unlock(hs, &bd, 0);
+ }
+
+ seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb);
+ for (i = 0; i < 8; i++)
+ seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/');
+
+ cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_debug_str);
diff --git a/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
new file mode 100644
index 000000000..d9b7c6b69
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
@@ -0,0 +1,240 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * Kernel <-> userspace communication routines.
+ * Using pipes for all arches.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_KUC D_OTHER
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* This is the kernel side (liblustre as well). */
+
+/**
+ * libcfs_kkuc_msg_put - send a message from kernel to userspace
+ * @param filp file pointer (pipe) to send the message to
+ * @param payload Payload data. First field of payload is always
+ * struct kuc_hdr
+ */
+int libcfs_kkuc_msg_put(struct file *filp, void *payload)
+{
+ struct kuc_hdr *kuch = (struct kuc_hdr *)payload;
+ ssize_t count = kuch->kuc_msglen;
+ loff_t offset = 0;
+ mm_segment_t fs;
+ int rc = -ENOSYS;
+
+ if (filp == NULL || IS_ERR(filp))
+ return -EBADF;
+
+ if (kuch->kuc_magic != KUC_MAGIC) {
+ CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic);
+ return -ENOSYS;
+ }
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ while (count > 0) {
+ rc = vfs_write(filp, (void __force __user *)payload,
+ count, &offset);
+ if (rc < 0)
+ break;
+ count -= rc;
+ payload += rc;
+ rc = 0;
+ }
+ set_fs(fs);
+
+ if (rc < 0)
+ CWARN("message send failed (%d)\n", rc);
+ else
+ CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp);
+
+ return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_msg_put);
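+
+/*
+ * Example of sending a minimal generic message (illustrative sketch only,
+ * not part of the original code; "filp" is assumed to be an already
+ * registered pipe file pointer):
+ *
+ *	struct kuc_hdr hdr;
+ *
+ *	hdr.kuc_magic = KUC_MAGIC;
+ *	hdr.kuc_transport = KUC_TRANSPORT_GENERIC;
+ *	hdr.kuc_msgtype = KUC_MSG_SHUTDOWN;
+ *	hdr.kuc_msglen = sizeof(hdr);
+ *	rc = libcfs_kkuc_msg_put(filp, &hdr);
+ */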
+
+/* Broadcast groups are global across all mounted filesystems;
+ * i.e. registering for a group on 1 fs will get messages for that
+ * group from any fs */
+/** A single group registration has a uid and a file pointer */
+struct kkuc_reg {
+ struct list_head kr_chain;
+ int kr_uid;
+ struct file *kr_fp;
+ __u32 kr_data;
+};
+static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {};
+/* Protect message sending against remove and adds */
+static DECLARE_RWSEM(kg_sem);
+
+/** Add a receiver to a broadcast group
+ * @param filp pipe to write into
+ * @param uid identifier for this receiver
+ * @param group group number
+ * @param data user data passed back to group callbacks
+ */
+int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data)
+{
+ struct kkuc_reg *reg;
+
+ if (group > KUC_GRP_MAX) {
+ CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+ return -EINVAL;
+ }
+
+ /* fput in group_rem */
+ if (filp == NULL)
+ return -EBADF;
+
+ /* freed in group_rem */
+ reg = kmalloc(sizeof(*reg), GFP_KERNEL);
+ if (reg == NULL)
+ return -ENOMEM;
+
+ reg->kr_fp = filp;
+ reg->kr_uid = uid;
+ reg->kr_data = data;
+
+ down_write(&kg_sem);
+ if (kkuc_groups[group].next == NULL)
+ INIT_LIST_HEAD(&kkuc_groups[group]);
+ list_add(&reg->kr_chain, &kkuc_groups[group]);
+ up_write(&kg_sem);
+
+ CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group);
+
+ return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_add);
+
+int libcfs_kkuc_group_rem(int uid, int group)
+{
+ struct kkuc_reg *reg, *next;
+
+ if (kkuc_groups[group].next == NULL)
+ return 0;
+
+ if (uid == 0) {
+ /* Broadcast a shutdown message */
+ struct kuc_hdr lh;
+
+ lh.kuc_magic = KUC_MAGIC;
+ lh.kuc_transport = KUC_TRANSPORT_GENERIC;
+ lh.kuc_msgtype = KUC_MSG_SHUTDOWN;
+ lh.kuc_msglen = sizeof(lh);
+ libcfs_kkuc_group_put(group, &lh);
+ }
+
+ down_write(&kg_sem);
+ list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) {
+ if ((uid == 0) || (uid == reg->kr_uid)) {
+ list_del(&reg->kr_chain);
+ CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n",
+ reg->kr_uid, reg->kr_fp, group);
+ if (reg->kr_fp != NULL)
+ fput(reg->kr_fp);
+ kfree(reg);
+ }
+ }
+ up_write(&kg_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_rem);
+
+int libcfs_kkuc_group_put(int group, void *payload)
+{
+ struct kkuc_reg *reg;
+ int rc = 0;
+ int one_success = 0;
+
+ down_read(&kg_sem);
+ list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+ if (reg->kr_fp != NULL) {
+ rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
+ if (rc == 0)
+ one_success = 1;
+ else if (rc == -EPIPE) {
+ fput(reg->kr_fp);
+ reg->kr_fp = NULL;
+ }
+ }
+ }
+ up_read(&kg_sem);
+
+ /* don't return an error if the message has been delivered
+ * to at least one agent */
+ if (one_success)
+ rc = 0;
+
+ return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_put);
+
+/**
+ * Calls a callback function for each link of the given kuc group.
+ * @param group the group to call the function on.
+ * @param cb_func the function to be called.
+ * @param cb_arg extra argument to be passed to the callback function.
+ */
+int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+ void *cb_arg)
+{
+ struct kkuc_reg *reg;
+ int rc = 0;
+
+ if (group > KUC_GRP_MAX) {
+ CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+ return -EINVAL;
+ }
+
+ /* no link for this group */
+ if (kkuc_groups[group].next == NULL)
+ return 0;
+
+ down_write(&kg_sem);
+ list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+ if (reg->kr_fp != NULL)
+ rc = cb_func(reg->kr_data, cb_arg);
+ }
+ up_write(&kg_sem);
+
+ return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_foreach);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
new file mode 100644
index 000000000..31a558115
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
@@ -0,0 +1,224 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/** Global CPU partition table */
+struct cfs_cpt_table *cfs_cpt_table __read_mostly;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC 0xbabecafe
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+ struct cfs_cpt_table *cptab;
+
+ if (ncpt != 1) {
+ CERROR("Can't support cpu partition number %d\n", ncpt);
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(cptab, sizeof(*cptab));
+ if (cptab != NULL) {
+ cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+ cptab->ctb_nparts = ncpt;
+ }
+
+ return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+ LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+
+ LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+#ifdef CONFIG_SMP
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+ int rc = 0;
+
+ rc = snprintf(buf, len, "%d\t: %d\n", 0, 0);
+ len -= rc;
+ if (len <= 0)
+ return -EFBIG;
+
+ return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+#endif /* CONFIG_SMP */
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+ return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpu_ht_nsiblings(int cpu)
+{
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+ return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+ return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+ return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+void
+cfs_cpu_fini(void)
+{
+ if (cfs_cpt_table != NULL) {
+ cfs_cpt_table_free(cfs_cpt_table);
+ cfs_cpt_table = NULL;
+ }
+}
+
+int
+cfs_cpu_init(void)
+{
+ cfs_cpt_table = cfs_cpt_table_alloc(1);
+
+ return cfs_cpt_table != NULL ? 0 : -1;
+}
+
+#endif /* HAVE_LIBCFS_CPT */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
new file mode 100644
index 000000000..2c199c725
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+
+/** destroy cpu-partition lock, see libcfs_private.h for more detail */
+void
+cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
+{
+ LASSERT(pcl->pcl_locks != NULL);
+ LASSERT(!pcl->pcl_locked);
+
+ cfs_percpt_free(pcl->pcl_locks);
+ LIBCFS_FREE(pcl, sizeof(*pcl));
+}
+EXPORT_SYMBOL(cfs_percpt_lock_free);
+
+/**
+ * create cpu-partition lock, see libcfs_private.h for more detail.
+ *
+ * cpu-partition lock is designed for large-scale SMP systems, so we need to
+ * reduce cacheline conflicts as much as possible; that is the
+ * reason we always allocate cacheline-aligned memory blocks.
+ */
+struct cfs_percpt_lock *
+cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab)
+{
+ struct cfs_percpt_lock *pcl;
+ spinlock_t *lock;
+ int i;
+
+ /* NB: cptab can be NULL, pcl will be for HW CPUs in that case */
+ LIBCFS_ALLOC(pcl, sizeof(*pcl));
+ if (pcl == NULL)
+ return NULL;
+
+ pcl->pcl_cptab = cptab;
+ pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock));
+ if (pcl->pcl_locks == NULL) {
+ LIBCFS_FREE(pcl, sizeof(*pcl));
+ return NULL;
+ }
+
+ cfs_percpt_for_each(lock, i, pcl->pcl_locks)
+ spin_lock_init(lock);
+
+ return pcl;
+}
+EXPORT_SYMBOL(cfs_percpt_lock_alloc);
+
+/**
+ * lock a CPU partition
+ *
+ * \a index != CFS_PERCPT_LOCK_EX
+ * hold private lock indexed by \a index
+ *
+ * \a index == CFS_PERCPT_LOCK_EX
+ * exclusively lock @pcl and nobody can take private lock
+ */
+void
+cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index)
+{
+ int ncpt = cfs_cpt_number(pcl->pcl_cptab);
+ int i;
+
+ LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt);
+
+ if (ncpt == 1) {
+ index = 0;
+ } else { /* serialize with exclusive lock */
+ while (pcl->pcl_locked)
+ cpu_relax();
+ }
+
+ if (likely(index != CFS_PERCPT_LOCK_EX)) {
+ spin_lock(pcl->pcl_locks[index]);
+ return;
+ }
+
+ /* exclusive lock request */
+ for (i = 0; i < ncpt; i++) {
+ spin_lock(pcl->pcl_locks[i]);
+ if (i == 0) {
+ LASSERT(!pcl->pcl_locked);
+ /* nobody should take a private lock after this,
+ * so we won't starve for too long */
+ pcl->pcl_locked = 1;
+ }
+ }
+}
+EXPORT_SYMBOL(cfs_percpt_lock);
+
+/** unlock a CPU partition */
+void
+cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index)
+{
+ int ncpt = cfs_cpt_number(pcl->pcl_cptab);
+ int i;
+
+ index = ncpt == 1 ? 0 : index;
+
+ if (likely(index != CFS_PERCPT_LOCK_EX)) {
+ spin_unlock(pcl->pcl_locks[index]);
+ return;
+ }
+
+ for (i = ncpt - 1; i >= 0; i--) {
+ if (i == 0) {
+ LASSERT(pcl->pcl_locked);
+ pcl->pcl_locked = 0;
+ }
+ spin_unlock(pcl->pcl_locks[i]);
+ }
+}
+EXPORT_SYMBOL(cfs_percpt_unlock);
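+
+/*
+ * Example usage (illustrative sketch only, not part of the original code;
+ * "plock" is assumed to be a lock previously allocated with
+ * cfs_percpt_lock_alloc()):
+ *
+ *	int cpt = cfs_cpt_current(cfs_cpt_table, 1);
+ *
+ *	cfs_percpt_lock(plock, cpt);
+ *	... touch per-partition data for @cpt only ...
+ *	cfs_percpt_unlock(plock, cpt);
+ *
+ *	cfs_percpt_lock(plock, CFS_PERCPT_LOCK_EX);
+ *	... touch data of all partitions exclusively ...
+ *	cfs_percpt_unlock(plock, CFS_PERCPT_LOCK_EX);
+ */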
+
+
+/** free cpu-partition refcount */
+void
+cfs_percpt_atomic_free(atomic_t **refs)
+{
+ cfs_percpt_free(refs);
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_free);
+
+/** allocate cpu-partition refcount with initial value @init_val */
+atomic_t **
+cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val)
+{
+ atomic_t **refs;
+ atomic_t *ref;
+ int i;
+
+ refs = cfs_percpt_alloc(cptab, sizeof(*ref));
+ if (refs == NULL)
+ return NULL;
+
+ cfs_percpt_for_each(ref, i, refs)
+ atomic_set(ref, init_val);
+ return refs;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_alloc);
+
+/** return sum of cpu-partition refs */
+int
+cfs_percpt_atomic_summary(atomic_t **refs)
+{
+ atomic_t *ref;
+ int i;
+ int val = 0;
+
+ cfs_percpt_for_each(ref, i, refs)
+ val += atomic_read(ref);
+
+ return val;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_summary);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
new file mode 100644
index 000000000..1debdda72
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+struct cfs_var_array {
+ unsigned int va_count; /* # of buffers */
+ unsigned int va_size; /* size of each var */
+ struct cfs_cpt_table *va_cptab; /* cpu partition table */
+ void *va_ptrs[0]; /* buffer addresses */
+};
+
+/*
+ * free per-cpu data, see more detail in cfs_percpt_alloc
+ */
+void
+cfs_percpt_free(void *vars)
+{
+ struct cfs_var_array *arr;
+ int i;
+
+ arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+ for (i = 0; i < arr->va_count; i++) {
+ if (arr->va_ptrs[i] != NULL)
+ LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+ }
+
+ LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+ va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_percpt_free);
+
+/*
+ * allocate per cpu-partition variables; the returned value is an array of
+ * pointers, indexed by CPU partition ID, i.e.:
+ *
+ * arr = cfs_percpt_alloc(cfs_cpu_pt, size);
+ * then caller can access memory block for CPU 0 by arr[0],
+ * memory block for CPU 1 by arr[1]...
+ * memory block for CPU N by arr[N]...
+ *
+ * cacheline aligned.
+ */
+void *
+cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
+{
+ struct cfs_var_array *arr;
+ int count;
+ int i;
+
+ count = cfs_cpt_number(cptab);
+
+ LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+ if (arr == NULL)
+ return NULL;
+
+ arr->va_size = size = L1_CACHE_ALIGN(size);
+ arr->va_count = count;
+ arr->va_cptab = cptab;
+
+ for (i = 0; i < count; i++) {
+ LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
+ if (arr->va_ptrs[i] == NULL) {
+ cfs_percpt_free((void *)&arr->va_ptrs[0]);
+ return NULL;
+ }
+ }
+
+ return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_percpt_alloc);
+
+/*
+ * return number of CPUs (or number of elements in per-cpu data)
+ * according to cptab of @vars
+ */
+int
+cfs_percpt_number(void *vars)
+{
+ struct cfs_var_array *arr;
+
+ arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+ return arr->va_count;
+}
+EXPORT_SYMBOL(cfs_percpt_number);
+
+/*
+ * return memory block shadowed from current CPU
+ */
+void *
+cfs_percpt_current(void *vars)
+{
+ struct cfs_var_array *arr;
+ int cpt;
+
+ arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+ cpt = cfs_cpt_current(arr->va_cptab, 0);
+ if (cpt < 0)
+ return NULL;
+
+ return arr->va_ptrs[cpt];
+}
+EXPORT_SYMBOL(cfs_percpt_current);
+
+void *
+cfs_percpt_index(void *vars, int idx)
+{
+ struct cfs_var_array *arr;
+
+ arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+ LASSERT(idx >= 0 && idx < arr->va_count);
+ return arr->va_ptrs[idx];
+}
+EXPORT_SYMBOL(cfs_percpt_index);
+
+/*
+ * free variable array, see more detail in cfs_array_alloc
+ */
+void
+cfs_array_free(void *vars)
+{
+ struct cfs_var_array *arr;
+ int i;
+
+ arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+ for (i = 0; i < arr->va_count; i++) {
+ if (arr->va_ptrs[i] == NULL)
+ continue;
+
+ LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+ }
+ LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+ va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_array_free);
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by @count, @size is size of each
+ * memory block in array.
+ */
+void *
+cfs_array_alloc(int count, unsigned int size)
+{
+ struct cfs_var_array *arr;
+ int i;
+
+ LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+ if (arr == NULL)
+ return NULL;
+
+ arr->va_count = count;
+ arr->va_size = size;
+
+ for (i = 0; i < count; i++) {
+ LIBCFS_ALLOC(arr->va_ptrs[i], size);
+
+ if (arr->va_ptrs[i] == NULL) {
+ cfs_array_free((void *)&arr->va_ptrs[0]);
+ return NULL;
+ }
+ }
+
+ return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_array_alloc);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
new file mode 100644
index 000000000..76d4392bd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
@@ -0,0 +1,562 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+ int *oldmask, int minmask, int allmask)
+{
+ const char *debugstr;
+ char op = '\0';
+ int newmask = minmask, i, len, found = 0;
+
+ /* <str> must be a list of tokens separated by whitespace
+ * and optionally an operator ('+' or '-'). If an operator
+ * appears first in <str>, '*oldmask' is used as the starting point
+ * (relative), otherwise minmask is used (absolute). An operator
+ * applies to all following tokens up to the next operator. */
+ while (*str != '\0') {
+ while (isspace(*str))
+ str++;
+ if (*str == '\0')
+ break;
+ if (*str == '+' || *str == '-') {
+ op = *str++;
+ if (!found)
+ /* only if first token is relative */
+ newmask = *oldmask;
+ while (isspace(*str))
+ str++;
+ if (*str == '\0') /* trailing op */
+ return -EINVAL;
+ }
+
+ /* find token length */
+ len = 0;
+ while (str[len] != '\0' && !isspace(str[len]) &&
+ str[len] != '+' && str[len] != '-')
+ len++;
+
+ /* match token */
+ found = 0;
+ for (i = 0; i < 32; i++) {
+ debugstr = bit2str(i);
+ if (debugstr != NULL &&
+ strlen(debugstr) == len &&
+ strncasecmp(str, debugstr, len) == 0) {
+ if (op == '-')
+ newmask &= ~(1 << i);
+ else
+ newmask |= (1 << i);
+ found = 1;
+ break;
+ }
+ }
+ if (!found && len == 3 &&
+ (strncasecmp(str, "ALL", len) == 0)) {
+ if (op == '-')
+ newmask = minmask;
+ else
+ newmask = allmask;
+ found = 1;
+ }
+ if (!found) {
+ CWARN("unknown mask '%.*s'.\n"
+ "mask usage: [+|-]<all|type> ...\n", len, str);
+ return -EINVAL;
+ }
+ str += len;
+ }
+
+ *oldmask = newmask;
+ return 0;
+}
+
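+/*
+ * Example (illustrative sketch only, not part of the original code;
+ * "my_bit2str" is a hypothetical callback mapping a bit number to its
+ * token name). Because the string starts with an operator, parsing is
+ * relative to the old mask: the bit named tok_a is set and the bit
+ * named tok_b is cleared:
+ *
+ *	int mask = 0;
+ *	rc = cfs_str2mask("+tok_a -tok_b", my_bit2str, &mask, 0, ~0);
+ */
+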
+/* get the first string out of @str */
+char *cfs_firststr(char *str, size_t size)
+{
+ size_t i = 0;
+ char *end;
+
+ /* trim leading spaces */
+ while (i < size && *str && isspace(*str)) {
+ ++i;
+ ++str;
+ }
+
+ /* string with all spaces */
+ if (*str == '\0')
+ goto out;
+
+ end = str;
+ while (i < size && *end != '\0' && !isspace(*end)) {
+ ++i;
+ ++end;
+ }
+
+ *end = '\0';
+out:
+ return str;
+}
+EXPORT_SYMBOL(cfs_firststr);
+
+char *
+cfs_trimwhite(char *str)
+{
+ char *end;
+
+ while (isspace(*str))
+ str++;
+
+ end = str + strlen(str);
+ while (end > str) {
+ if (!isspace(end[-1]))
+ break;
+ end--;
+ }
+
+ *end = 0;
+ return str;
+}
+EXPORT_SYMBOL(cfs_trimwhite);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+ char *end;
+
+ if (next->ls_str == NULL)
+ return 0;
+
+ /* skip leading white spaces */
+ while (next->ls_len) {
+ if (!isspace(*next->ls_str))
+ break;
+ next->ls_str++;
+ next->ls_len--;
+ }
+
+ if (next->ls_len == 0) /* whitespaces only */
+ return 0;
+
+ if (*next->ls_str == delim) {
+ /* first non-whitespace character is the delimiter */
+ return 0;
+ }
+
+ res->ls_str = next->ls_str;
+ end = memchr(next->ls_str, delim, next->ls_len);
+ if (end == NULL) {
+ /* the delimiter is not in the string */
+ end = next->ls_str + next->ls_len;
+ next->ls_str = NULL;
+ } else {
+ next->ls_str = end + 1;
+ next->ls_len -= (end - res->ls_str + 1);
+ }
+
+ /* skip ending whitespaces */
+ while (--end != res->ls_str) {
+ if (!isspace(*end))
+ break;
+ }
+
+ res->ls_len = end - res->ls_str + 1;
+ return 1;
+}
+
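+/*
+ * Example (illustrative sketch only, not part of the original code):
+ * splitting "lo, eth0 , ib0" on ',' yields the trimmed tokens "lo",
+ * "eth0" and "ib0", one call at a time:
+ *
+ *	struct cfs_lstr next = { .ls_str = buf, .ls_len = strlen(buf) };
+ *	struct cfs_lstr tok;
+ *
+ *	while (cfs_gettok(&next, ',', &tok))
+ *		CDEBUG(D_INFO, "token: %.*s\n", tok.ls_len, tok.ls_str);
+ */
+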
+/**
+ * Converts string to integer.
+ *
+ * Accepts decimal and hexadecimal number recordings.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+ unsigned min, unsigned max)
+{
+ char *endp;
+
+ str = cfs_trimwhite(str);
+ *num = strtoul(str, &endp, 0);
+ if (endp == str)
+ return 0;
+
+ for (; endp < str + nob; endp++) {
+ if (!isspace(*endp))
+ return 0;
+ }
+
+ return (*num >= min && *num <= max);
+}
+
+/**
+ * Parses \<range_expr\> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be \<number\> or \*
+ *
+ * \retval 0 if \a src parses to
+ * \<number\> |
+ * \<number\> '-' \<number\> |
+ * \<number\> '-' \<number\> '/' \<number\>
+ * in which case \a expr points to an allocated cfs_range_expr with
+ * re_lo, re_hi and re_stride initialized from the parsed values
+ * \retval -EINVAL or -ENOMEM otherwise
+ */
+static int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+ int bracketed, struct cfs_range_expr **expr)
+{
+ struct cfs_range_expr *re;
+ struct cfs_lstr tok;
+
+ LIBCFS_ALLOC(re, sizeof(*re));
+ if (re == NULL)
+ return -ENOMEM;
+
+ if (src->ls_len == 1 && src->ls_str[0] == '*') {
+ re->re_lo = min;
+ re->re_hi = max;
+ re->re_stride = 1;
+ goto out;
+ }
+
+ if (cfs_str2num_check(src->ls_str, src->ls_len,
+ &re->re_lo, min, max)) {
+ /* <number> is parsed */
+ re->re_hi = re->re_lo;
+ re->re_stride = 1;
+ goto out;
+ }
+
+ if (!bracketed || !cfs_gettok(src, '-', &tok))
+ goto failed;
+
+ if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+ &re->re_lo, min, max))
+ goto failed;
+
+ /* <number> - */
+ if (cfs_str2num_check(src->ls_str, src->ls_len,
+ &re->re_hi, min, max)) {
+ /* <number> - <number> is parsed */
+ re->re_stride = 1;
+ goto out;
+ }
+
+ /* go to check <number> '-' <number> '/' <number> */
+ if (cfs_gettok(src, '/', &tok)) {
+ if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+ &re->re_hi, min, max))
+ goto failed;
+
+ /* <number> - <number> / ... */
+ if (cfs_str2num_check(src->ls_str, src->ls_len,
+ &re->re_stride, min, max)) {
+ /* <number> - <number> / <number> is parsed */
+ goto out;
+ }
+ }
+
+ out:
+ *expr = re;
+ return 0;
+
+ failed:
+ LIBCFS_FREE(re, sizeof(*re));
+ return -EINVAL;
+}
+
+/**
+ * Matches value (\a value) against ranges expression list \a expr_list.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
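+ *
+ * For example, with \a expr_list parsed from "[0-7/2]", the values
+ * 0, 2, 4 and 6 match while 1, 3, 5 and 7 do not.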
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+ struct cfs_range_expr *expr;
+
+ list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+ if (value >= expr->re_lo && value <= expr->re_hi &&
+ ((value - expr->re_lo) % expr->re_stride) == 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * Converts expression list (\a expr_list) to an array of all matched values
+ *
+ * \retval N where N is the total number of matched values
+ * \retval 0 if expression list is empty
+ * \retval < 0 for failure
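+ *
+ * For example, an expression list parsed from "[0-7/2]" yields the
+ * array {0, 2, 4, 6} and a return value of 4.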
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+ struct cfs_range_expr *expr;
+ __u32 *val;
+ int count = 0;
+ int i;
+
+ list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+ for (i = expr->re_lo; i <= expr->re_hi; i++) {
+ if (((i - expr->re_lo) % expr->re_stride) == 0)
+ count++;
+ }
+ }
+
+ if (count == 0) /* empty expression list */
+ return 0;
+
+ if (count > max) {
+ CERROR("Number of values %d exceeds max allowed %d\n",
+ max, count);
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(val, sizeof(val[0]) * count);
+ if (val == NULL)
+ return -ENOMEM;
+
+ count = 0;
+ list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+ for (i = expr->re_lo; i <= expr->re_hi; i++) {
+ if (((i - expr->re_lo) % expr->re_stride) == 0)
+ val[count++] = i;
+ }
+ }
+
+ *valpp = val;
+ return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
+
+/**
+ * Frees cfs_range_expr structures of \a expr_list.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+ while (!list_empty(&expr_list->el_exprs)) {
+ struct cfs_range_expr *expr;
+
+ expr = list_entry(expr_list->el_exprs.next,
+ struct cfs_range_expr, re_link);
+ list_del(&expr->re_link);
+ LIBCFS_FREE(expr, sizeof(*expr));
+ }
+
+ LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+/**
+ * Parses \<cfs_expr_list\> token of the syntax.
+ *
+ * \retval 0 if \a str parses to \<number\> | \<expr_list\>
+ * \retval -EINVAL or -ENOMEM otherwise
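+ *
+ * For example, "[2,4-6]" with min = 0 and max = 7 produces two range
+ * expressions: 2-2/1 and 4-6/1.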
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+ struct cfs_expr_list **elpp)
+{
+ struct cfs_expr_list *expr_list;
+ struct cfs_range_expr *expr;
+ struct cfs_lstr src;
+ int rc;
+
+ LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+ if (expr_list == NULL)
+ return -ENOMEM;
+
+ src.ls_str = str;
+ src.ls_len = len;
+
+ INIT_LIST_HEAD(&expr_list->el_exprs);
+
+ if (src.ls_str[0] == '[' &&
+ src.ls_str[src.ls_len - 1] == ']') {
+ src.ls_str++;
+ src.ls_len -= 2;
+
+ rc = -EINVAL;
+ while (src.ls_str != NULL) {
+ struct cfs_lstr tok;
+
+ if (!cfs_gettok(&src, ',', &tok)) {
+ rc = -EINVAL;
+ break;
+ }
+
+ rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+ if (rc != 0)
+ break;
+
+ list_add_tail(&expr->re_link,
+ &expr_list->el_exprs);
+ }
+ } else {
+ rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+ if (rc == 0) {
+ list_add_tail(&expr->re_link,
+ &expr_list->el_exprs);
+ }
+ }
+
+ if (rc != 0)
+ cfs_expr_list_free(expr_list);
+ else
+ *elpp = expr_list;
+
+ return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+ struct cfs_expr_list *el;
+
+ while (!list_empty(list)) {
+ el = list_entry(list->next,
+ struct cfs_expr_list, el_link);
+ list_del(&el->el_link);
+ cfs_expr_list_free(el);
+ }
+}
+
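+/**
+ * Parses an IPv4 address expression, e.g. "192.168.[1-3].*", into a list
+ * of four cfs_expr_list structures (one per octet, in order).
+ *
+ * \retval 0 on success
+ * \retval -EINVAL or -ENOMEM otherwise
+ */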
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+ struct cfs_expr_list *el;
+ struct cfs_lstr src;
+ int rc;
+ int i;
+
+ src.ls_str = str;
+ src.ls_len = len;
+ i = 0;
+
+ while (src.ls_str != NULL) {
+ struct cfs_lstr res;
+
+ if (!cfs_gettok(&src, '.', &res)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+ if (rc != 0)
+ goto out;
+
+ list_add_tail(&el->el_link, list);
+ i++;
+ }
+
+ if (i == 4)
+ return 0;
+
+ rc = -EINVAL;
+ out:
+ cfs_expr_list_free_list(list);
+
+ return rc;
+}
+EXPORT_SYMBOL(cfs_ip_addr_parse);
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
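+ *
+ * For example, with \a list built from "192.168.[1-3].*", the address
+ * 192.168.2.5 matches (assuming the first octet is stored in the most
+ * significant byte of \a addr).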
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+ struct cfs_expr_list *el;
+ int i = 0;
+
+ list_for_each_entry_reverse(el, list, el_link) {
+ if (!cfs_expr_list_match(addr & 0xff, el))
+ return 0;
+ addr >>= 8;
+ i++;
+ }
+
+ return i == 4;
+}
+EXPORT_SYMBOL(cfs_ip_addr_match);
+
+void
+cfs_ip_addr_free(struct list_head *list)
+{
+ cfs_expr_list_free_list(list);
+}
+EXPORT_SYMBOL(cfs_ip_addr_free);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
new file mode 100644
index 000000000..cc3ab3519
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -0,0 +1,1056 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#ifdef CONFIG_SMP
+
+/**
+ * modparam for setting number of partitions
+ *
+ * 0 : estimate best value based on cores or NUMA nodes
+ * 1 : disable multiple partitions
+ * >1 : specify number of partitions
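+ *
+ * e.g. loading the module with "cpu_npartitions=4" requests four
+ * partitions (assuming the module is loaded as libcfs).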
+ */
+static int cpu_npartitions;
+module_param(cpu_npartitions, int, 0444);
+MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
+
+/**
+ * modparam for setting CPU partitions patterns:
+ *
+ * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
+ * number in bracket is processor ID (core or HT)
+ *
+ * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
+ * are NUMA node ID, number before bracket is CPU partition ID.
+ *
+ * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
+ */
+static char *cpu_pattern = "";
+module_param(cpu_pattern, charp, 0444);
+MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
+
+struct cfs_cpt_data {
+ /* serialize hotplug etc */
+ spinlock_t cpt_lock;
+ /* reserved for hotplug */
+ unsigned long cpt_version;
+ /* mutex to protect cpt_cpumask */
+ struct mutex cpt_mutex;
+ /* scratch buffer for set/unset_node */
+ cpumask_t *cpt_cpumask;
+};
+
+static struct cfs_cpt_data cpt_data;
+
+static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
+{
+ /* return cpumask of cores in the same socket */
+ cpumask_copy(mask, topology_core_cpumask(cpu));
+}
+
+/* return cpumask of HTs in the same core */
+static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
+{
+ cpumask_copy(mask, topology_thread_cpumask(cpu));
+}
+
+static void cfs_node_to_cpumask(int node, cpumask_t *mask)
+{
+ cpumask_copy(mask, cpumask_of_node(node));
+}
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+ int i;
+
+ if (cptab->ctb_cpu2cpt != NULL) {
+ LIBCFS_FREE(cptab->ctb_cpu2cpt,
+ num_possible_cpus() *
+ sizeof(cptab->ctb_cpu2cpt[0]));
+ }
+
+ for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
+ struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+ if (part->cpt_nodemask != NULL) {
+ LIBCFS_FREE(part->cpt_nodemask,
+ sizeof(*part->cpt_nodemask));
+ }
+
+ if (part->cpt_cpumask != NULL)
+ LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+ }
+
+ if (cptab->ctb_parts != NULL) {
+ LIBCFS_FREE(cptab->ctb_parts,
+ cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
+ }
+
+ if (cptab->ctb_nodemask != NULL)
+ LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+ if (cptab->ctb_cpumask != NULL)
+ LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
+
+ LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+ struct cfs_cpt_table *cptab;
+ int i;
+
+ LIBCFS_ALLOC(cptab, sizeof(*cptab));
+ if (cptab == NULL)
+ return NULL;
+
+ cptab->ctb_nparts = ncpt;
+
+ LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
+ LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+
+ if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
+ goto failed;
+
+ LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
+ num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+ if (cptab->ctb_cpu2cpt == NULL)
+ goto failed;
+
+ memset(cptab->ctb_cpu2cpt, -1,
+ num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+
+ LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
+ if (cptab->ctb_parts == NULL)
+ goto failed;
+
+ for (i = 0; i < ncpt; i++) {
+ struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+ LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+ LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
+ if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+ goto failed;
+ }
+
+ spin_lock(&cpt_data.cpt_lock);
+ /* Reserved for hotplug */
+ cptab->ctb_version = cpt_data.cpt_version;
+ spin_unlock(&cpt_data.cpt_lock);
+
+ return cptab;
+
+ failed:
+ cfs_cpt_table_free(cptab);
+ return NULL;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+ char *tmp = buf;
+ int rc = 0;
+ int i;
+ int j;
+
+ for (i = 0; i < cptab->ctb_nparts; i++) {
+ if (len > 0) {
+ rc = snprintf(tmp, len, "%d\t: ", i);
+ len -= rc;
+ }
+
+ if (len <= 0) {
+ rc = -EFBIG;
+ goto out;
+ }
+
+ tmp += rc;
+ for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
+ rc = snprintf(tmp, len, "%d ", j);
+ len -= rc;
+ if (len <= 0) {
+ rc = -EFBIG;
+ goto out;
+ }
+ tmp += rc;
+ }
+
+ *tmp = '\n';
+ tmp++;
+ len--;
+ }
+
+ out:
+ if (rc < 0)
+ return rc;
+
+ return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+ return cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+ LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+ return cpt == CFS_CPT_ANY ?
+ cpumask_weight(cptab->ctb_cpumask) :
+ cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+ LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+ return cpt == CFS_CPT_ANY ?
+ cpumask_any_and(cptab->ctb_cpumask,
+ cpu_online_mask) < nr_cpu_ids :
+ cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
+ cpu_online_mask) < nr_cpu_ids;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+ LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+ return cpt == CFS_CPT_ANY ?
+ cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+nodemask_t *
+cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
+{
+ LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+ return cpt == CFS_CPT_ANY ?
+ cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
+}
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+ int node;
+
+ LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+
+ if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
+ return 0;
+ }
+
+ if (cptab->ctb_cpu2cpt[cpu] != -1) {
+ CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
+ cpu, cptab->ctb_cpu2cpt[cpu]);
+ return 0;
+ }
+
+ cptab->ctb_cpu2cpt[cpu] = cpt;
+
+ LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
+ LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
+
+ cpumask_set_cpu(cpu, cptab->ctb_cpumask);
+ cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+
+ node = cpu_to_node(cpu);
+
+ /* first CPU of @node in this CPT table */
+ if (!node_isset(node, *cptab->ctb_nodemask))
+ node_set(node, *cptab->ctb_nodemask);
+
+ /* first CPU of @node in this partition */
+ if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
+ node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+ int node;
+ int i;
+
+ LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+ if (cpu < 0 || cpu >= nr_cpu_ids) {
+ CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
+ return;
+ }
+
+ if (cpt == CFS_CPT_ANY) {
+ /* caller doesn't know the partition ID */
+ cpt = cptab->ctb_cpu2cpt[cpu];
+ if (cpt < 0) { /* not set in this CPT-table */
+ CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
+ cpt, cptab);
+ return;
+ }
+
+ } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
+ CDEBUG(D_INFO,
+ "CPU %d is not in cpu-partition %d\n", cpu, cpt);
+ return;
+ }
+
+ LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
+ LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
+
+ cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
+ cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
+ cptab->ctb_cpu2cpt[cpu] = -1;
+
+ node = cpu_to_node(cpu);
+
+ LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
+ LASSERT(node_isset(node, *cptab->ctb_nodemask));
+
+ for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
+ /* this CPT has other CPU belonging to this node? */
+ if (cpu_to_node(i) == node)
+ break;
+ }
+
+ if (i >= nr_cpu_ids)
+ node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+ for_each_cpu(i, cptab->ctb_cpumask) {
+ /* this CPT-table has other CPU belonging to this node? */
+ if (cpu_to_node(i) == node)
+ break;
+ }
+
+ if (i >= nr_cpu_ids)
+ node_clear(node, *cptab->ctb_nodemask);
+
+ return;
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+ int i;
+
+ if (cpumask_weight(mask) == 0 ||
+ cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
+ CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
+ cpt);
+ return 0;
+ }
+
+ for_each_cpu(i, mask) {
+ if (!cfs_cpt_set_cpu(cptab, cpt, i))
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+ int i;
+
+ for_each_cpu(i, mask)
+ cfs_cpt_unset_cpu(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+ cpumask_t *mask;
+ int rc;
+
+ if (node < 0 || node >= MAX_NUMNODES) {
+ CDEBUG(D_INFO,
+ "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+ return 0;
+ }
+
+ mutex_lock(&cpt_data.cpt_mutex);
+
+ mask = cpt_data.cpt_cpumask;
+ cfs_node_to_cpumask(node, mask);
+
+ rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
+
+ mutex_unlock(&cpt_data.cpt_mutex);
+
+ return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+ cpumask_t *mask;
+
+ if (node < 0 || node >= MAX_NUMNODES) {
+ CDEBUG(D_INFO,
+ "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+ return;
+ }
+
+ mutex_lock(&cpt_data.cpt_mutex);
+
+ mask = cpt_data.cpt_cpumask;
+ cfs_node_to_cpumask(node, mask);
+
+ cfs_cpt_unset_cpumask(cptab, cpt, mask);
+
+ mutex_unlock(&cpt_data.cpt_mutex);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+ int i;
+
+ for_each_node_mask(i, *mask) {
+ if (!cfs_cpt_set_node(cptab, cpt, i))
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+ int i;
+
+ for_each_node_mask(i, *mask)
+ cfs_cpt_unset_node(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+ int last;
+ int i;
+
+ if (cpt == CFS_CPT_ANY) {
+ last = cptab->ctb_nparts - 1;
+ cpt = 0;
+ } else {
+ last = cpt;
+ }
+
+ for (; cpt <= last; cpt++) {
+ for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask)
+ cfs_cpt_unset_cpu(cptab, cpt, i);
+ }
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+ nodemask_t *mask;
+ int weight;
+ int rotor;
+ int node;
+
+ /* convert CPU partition ID to HW node id */
+
+ if (cpt < 0 || cpt >= cptab->ctb_nparts) {
+ mask = cptab->ctb_nodemask;
+ rotor = cptab->ctb_spread_rotor++;
+ } else {
+ mask = cptab->ctb_parts[cpt].cpt_nodemask;
+ rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+ }
+
+ weight = nodes_weight(*mask);
+ LASSERT(weight > 0);
+
+ rotor %= weight;
+
+ for_each_node_mask(node, *mask) {
+ if (rotor-- == 0)
+ return node;
+ }
+
+ LBUG();
+ return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+ int cpu = smp_processor_id();
+ int cpt = cptab->ctb_cpu2cpt[cpu];
+
+ if (cpt < 0) {
+ if (!remap)
+ return cpt;
+
+ /* don't return a negative value to the upper layer for safety;
+ * instead, map the unknown CPU to a valid partition ID */
+ cpt = cpu % cptab->ctb_nparts;
+ }
+
+ return cpt;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+ LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
+
+ return cptab->ctb_cpu2cpt[cpu];
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+ cpumask_t *cpumask;
+ nodemask_t *nodemask;
+ int rc;
+ int i;
+
+ LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+ if (cpt == CFS_CPT_ANY) {
+ cpumask = cptab->ctb_cpumask;
+ nodemask = cptab->ctb_nodemask;
+ } else {
+ cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
+ nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
+ }
+
+ if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) {
+ CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
+ cpt);
+ return -EINVAL;
+ }
+
+ for_each_online_cpu(i) {
+ if (cpumask_test_cpu(i, cpumask))
+ continue;
+
+ rc = set_cpus_allowed_ptr(current, cpumask);
+ set_mems_allowed(*nodemask);
+ if (rc == 0)
+ schedule(); /* switch to allowed CPU */
+
+ return rc;
+ }
+
+ /* don't need to set affinity because all online CPUs are covered */
+ return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/**
+ * Choose at most \a number CPUs from \a node and set them in \a cpt.
+ * We always prefer to choose CPUs in the same core/socket.
+ */
+static int
+cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
+ cpumask_t *node, int number)
+{
+ cpumask_t *socket = NULL;
+ cpumask_t *core = NULL;
+ int rc = 0;
+ int cpu;
+
+ LASSERT(number > 0);
+
+ if (number >= cpumask_weight(node)) {
+ while (!cpumask_empty(node)) {
+ cpu = cpumask_first(node);
+
+ rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
+ if (!rc)
+ return -EINVAL;
+ cpumask_clear_cpu(cpu, node);
+ }
+ return 0;
+ }
+
+ /* allocate scratch buffer */
+ LIBCFS_ALLOC(socket, cpumask_size());
+ LIBCFS_ALLOC(core, cpumask_size());
+ if (socket == NULL || core == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ while (!cpumask_empty(node)) {
+ cpu = cpumask_first(node);
+
+ /* get cpumask for cores in the same socket */
+ cfs_cpu_core_siblings(cpu, socket);
+ cpumask_and(socket, socket, node);
+
+ LASSERT(!cpumask_empty(socket));
+
+ while (!cpumask_empty(socket)) {
+ int i;
+
+ /* get cpumask for hts in the same core */
+ cfs_cpu_ht_siblings(cpu, core);
+ cpumask_and(core, core, node);
+
+ LASSERT(!cpumask_empty(core));
+
+ for_each_cpu(i, core) {
+ cpumask_clear_cpu(i, socket);
+ cpumask_clear_cpu(i, node);
+
+ rc = cfs_cpt_set_cpu(cptab, cpt, i);
+ if (!rc) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (--number == 0)
+ goto out;
+ }
+ cpu = cpumask_first(socket);
+ }
+ }
+
+ out:
+ if (socket != NULL)
+ LIBCFS_FREE(socket, cpumask_size());
+ if (core != NULL)
+ LIBCFS_FREE(core, cpumask_size());
+ return rc;
+}
+
+#define CPT_WEIGHT_MIN 4u
+
+static unsigned int
+cfs_cpt_num_estimate(void)
+{
+ unsigned nnode = num_online_nodes();
+ unsigned ncpu = num_online_cpus();
+ unsigned ncpt;
+
+ if (ncpu <= CPT_WEIGHT_MIN) {
+ ncpt = 1;
+ goto out;
+ }
+
+ /* generate a reasonable number of CPU partitions based on the total
+ * number of CPUs; the preferred N is a power of 2 that satisfies:
+ * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
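+ /* e.g. with 16 online CPUs the loop below stops at ncpt = 4,
+ * since 16 <= 2 * 4 * 4 */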
+ for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+
+ if (ncpt <= nnode) { /* fat numa system */
+ while (nnode > ncpt)
+ nnode >>= 1;
+
+ } else { /* ncpt > nnode */
+ while ((nnode << 1) <= ncpt)
+ nnode <<= 1;
+ }
+
+ ncpt = nnode;
+
+ out:
+#if (BITS_PER_LONG == 32)
+ /* configuring many CPU partitions on a 32-bit system could
+ * consume too much memory */
+ ncpt = min(2U, ncpt);
+#endif
+ while (ncpu % ncpt != 0)
+ ncpt--; /* worst case is 1 */
+
+ return ncpt;
+}
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create(int ncpt)
+{
+ struct cfs_cpt_table *cptab = NULL;
+ cpumask_t *mask = NULL;
+ int cpt = 0;
+ int num;
+ int rc;
+ int i;
+
+ rc = cfs_cpt_num_estimate();
+ if (ncpt <= 0)
+ ncpt = rc;
+
+ if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+ CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
+ ncpt, rc);
+ }
+
+ if (num_online_cpus() % ncpt != 0) {
+ CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n",
+ (int)num_online_cpus(), ncpt);
+ goto failed;
+ }
+
+ cptab = cfs_cpt_table_alloc(ncpt);
+ if (cptab == NULL) {
+ CERROR("Failed to allocate CPU map(%d)\n", ncpt);
+ goto failed;
+ }
+
+ num = num_online_cpus() / ncpt;
+ if (num == 0) {
+ CERROR("CPU changed while setting CPU partition\n");
+ goto failed;
+ }
+
+ LIBCFS_ALLOC(mask, cpumask_size());
+ if (mask == NULL) {
+ CERROR("Failed to allocate scratch cpumask\n");
+ goto failed;
+ }
+
+ for_each_online_node(i) {
+ cfs_node_to_cpumask(i, mask);
+
+ while (!cpumask_empty(mask)) {
+ struct cfs_cpu_partition *part;
+ int n;
+
+ if (cpt >= ncpt)
+ goto failed;
+
+ part = &cptab->ctb_parts[cpt];
+
+ n = num - cpumask_weight(part->cpt_cpumask);
+ LASSERT(n > 0);
+
+ rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+ if (rc < 0)
+ goto failed;
+
+ LASSERT(num >= cpumask_weight(part->cpt_cpumask));
+ if (num == cpumask_weight(part->cpt_cpumask))
+ cpt++;
+ }
+ }
+
+ if (cpt != ncpt ||
+ num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
+ CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
+ cptab->ctb_nparts, num, cpt,
+ cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
+ goto failed;
+ }
+
+ LIBCFS_FREE(mask, cpumask_size());
+
+ return cptab;
+
+ failed:
+ CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
+ ncpt, num_online_nodes(), num_online_cpus());
+
+ if (mask != NULL)
+ LIBCFS_FREE(mask, cpumask_size());
+
+ if (cptab != NULL)
+ cfs_cpt_table_free(cptab);
+
+ return NULL;
+}
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create_pattern(char *pattern)
+{
+ struct cfs_cpt_table *cptab;
+ char *str = pattern;
+ int node = 0;
+ int high;
+ int ncpt;
+ int c;
+
+ for (ncpt = 0;; ncpt++) { /* quick scan bracket */
+ str = strchr(str, '[');
+ if (str == NULL)
+ break;
+ str++;
+ }
+
+ str = cfs_trimwhite(pattern);
+ if (*str == 'n' || *str == 'N') {
+ pattern = str + 1;
+ node = 1;
+ }
+
+ if (ncpt == 0 ||
+ (node && ncpt > num_online_nodes()) ||
+ (!node && ncpt > num_online_cpus())) {
+ CERROR("Invalid pattern %s, or too many partitions %d\n",
+ pattern, ncpt);
+ return NULL;
+ }
+
+ high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
+
+ cptab = cfs_cpt_table_alloc(ncpt);
+ if (cptab == NULL) {
+ CERROR("Failed to allocate cpu partition table\n");
+ return NULL;
+ }
+
+ for (str = cfs_trimwhite(pattern), c = 0;; c++) {
+ struct cfs_range_expr *range;
+ struct cfs_expr_list *el;
+ char *bracket = strchr(str, '[');
+ int cpt;
+ int rc;
+ int i;
+ int n;
+
+ if (bracket == NULL) {
+ if (*str != 0) {
+ CERROR("Invalid pattern %s\n", str);
+ goto failed;
+ } else if (c != ncpt) {
+ CERROR("expect %d partitions but found %d\n",
+ ncpt, c);
+ goto failed;
+ }
+ break;
+ }
+
+ if (sscanf(str, "%d%n", &cpt, &n) < 1) {
+ CERROR("Invalid cpu pattern %s\n", str);
+ goto failed;
+ }
+
+ if (cpt < 0 || cpt >= ncpt) {
+ CERROR("Invalid partition id %d, total partitions %d\n",
+ cpt, ncpt);
+ goto failed;
+ }
+
+ if (cfs_cpt_weight(cptab, cpt) != 0) {
+ CERROR("Partition %d has already been set.\n", cpt);
+ goto failed;
+ }
+
+ str = cfs_trimwhite(str + n);
+ if (str != bracket) {
+ CERROR("Invalid pattern %s\n", str);
+ goto failed;
+ }
+
+ bracket = strchr(str, ']');
+ if (bracket == NULL) {
+ CERROR("missing right bracket for cpt %d, %s\n",
+ cpt, str);
+ goto failed;
+ }
+
+ if (cfs_expr_list_parse(str, (bracket - str) + 1,
+ 0, high, &el) != 0) {
+ CERROR("Can't parse number range: %s\n", str);
+ goto failed;
+ }
+
+ list_for_each_entry(range, &el->el_exprs, re_link) {
+ for (i = range->re_lo; i <= range->re_hi; i++) {
+ if ((i - range->re_lo) % range->re_stride != 0)
+ continue;
+
+ rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
+ cfs_cpt_set_cpu(cptab, cpt, i);
+ if (!rc) {
+ cfs_expr_list_free(el);
+ goto failed;
+ }
+ }
+ }
+
+ cfs_expr_list_free(el);
+
+ if (!cfs_cpt_online(cptab, cpt)) {
+ CERROR("No online CPU is found on partition %d\n", cpt);
+ goto failed;
+ }
+
+ str = cfs_trimwhite(bracket + 1);
+ }
+
+ return cptab;
+
+ failed:
+ cfs_cpt_table_free(cptab);
+ return NULL;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int
+cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ bool warn;
+
+ switch (action) {
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ spin_lock(&cpt_data.cpt_lock);
+ cpt_data.cpt_version++;
+ spin_unlock(&cpt_data.cpt_lock);
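+ /* fall through */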
+ default:
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
+ CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
+ cpu, action);
+ break;
+ }
+
+ mutex_lock(&cpt_data.cpt_mutex);
+ /* if all HTs in a core are offline, it may break affinity */
+ cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
+ warn = cpumask_any_and(cpt_data.cpt_cpumask,
+ cpu_online_mask) >= nr_cpu_ids;
+ mutex_unlock(&cpt_data.cpt_mutex);
+ CDEBUG(warn ? D_WARNING : D_INFO,
+ "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
+ cpu, action);
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cfs_cpu_notifier = {
+ .notifier_call = cfs_cpu_notify,
+ .priority = 0
+};
+
+#endif
+
+void
+cfs_cpu_fini(void)
+{
+ if (cfs_cpt_table != NULL)
+ cfs_cpt_table_free(cfs_cpt_table);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+ if (cpt_data.cpt_cpumask != NULL)
+ LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
+}
+
+int
+cfs_cpu_init(void)
+{
+ LASSERT(cfs_cpt_table == NULL);
+
+ memset(&cpt_data, 0, sizeof(cpt_data));
+
+ LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
+ if (cpt_data.cpt_cpumask == NULL) {
+ CERROR("Failed to allocate scratch buffer\n");
+ return -1;
+ }
+
+ spin_lock_init(&cpt_data.cpt_lock);
+ mutex_init(&cpt_data.cpt_mutex);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ register_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+
+ if (*cpu_pattern != 0) {
+ cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
+ if (cfs_cpt_table == NULL) {
+ CERROR("Failed to create cptab from pattern %s\n",
+ cpu_pattern);
+ goto failed;
+ }
+
+ } else {
+ cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
+ if (cfs_cpt_table == NULL) {
+ CERROR("Failed to create ptable with npartitions %d\n",
+ cpu_npartitions);
+ goto failed;
+ }
+ }
+
+ spin_lock(&cpt_data.cpt_lock);
+ if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
+ spin_unlock(&cpt_data.cpt_lock);
+ CERROR("CPU hotplug/unplug during setup\n");
+ goto failed;
+ }
+ spin_unlock(&cpt_data.cpt_lock);
+
+ LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
+ num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
+ return 0;
+
+ failed:
+ cfs_cpu_fini();
+ return -1;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
new file mode 100644
index 000000000..5e185fa59
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
@@ -0,0 +1,141 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * These are crypto API shash wrappers for zlib_adler32.
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include <crypto/internal/hash.h>
+#include "linux-crypto.h"
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+static u32 __adler32(u32 cksum, unsigned char const *p, size_t len)
+{
+ return zlib_adler32(cksum, p, len);
+}
+
+static int adler32_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = 1;
+
+ return 0;
+}
+
+static int adler32_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = *(u32 *)key;
+ return 0;
+}
+
+static int adler32_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *cksump = shash_desc_ctx(desc);
+
+ *cksump = *mctx;
+
+ return 0;
+}
+
+static int adler32_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *cksump = shash_desc_ctx(desc);
+
+ *cksump = __adler32(*cksump, data, len);
+ return 0;
+}
+static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(u32 *)out = __adler32(*cksump, data, len);
+ return 0;
+}
+
+static int adler32_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __adler32_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int adler32_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *cksump = shash_desc_ctx(desc);
+
+ *(u32 *)out = *cksump;
+ return 0;
+}
+
+static int adler32_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+static struct shash_alg alg = {
+ .setkey = adler32_setkey,
+ .init = adler32_init,
+ .update = adler32_update,
+ .final = adler32_final,
+ .finup = adler32_finup,
+ .digest = adler32_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "adler32",
+ .cra_driver_name = "adler32-zlib",
+ .cra_priority = 100,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = adler32_cra_init,
+ }
+};
+
+
+int cfs_crypto_adler32_register(void)
+{
+ return crypto_register_shash(&alg);
+}
+
+void cfs_crypto_adler32_unregister(void)
+{
+ crypto_unregister_shash(&alg);
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
new file mode 100644
index 000000000..aa3fffed1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
@@ -0,0 +1,291 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "linux-crypto.h"
+/**
+ * Array of hash algorithm speed in MByte per second
+ */
+static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX];
+
+
+
+static int cfs_crypto_hash_alloc(unsigned char alg_id,
+ const struct cfs_crypto_hash_type **type,
+ struct hash_desc *desc, unsigned char *key,
+ unsigned int key_len)
+{
+ int err = 0;
+
+ *type = cfs_crypto_hash_type(alg_id);
+
+ if (*type == NULL) {
+ CWARN("Unsupported hash algorithm id = %d, max id is %d\n",
+ alg_id, CFS_HASH_ALG_MAX);
+ return -EINVAL;
+ }
+ desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0);
+
+ if (desc->tfm == NULL)
+ return -EINVAL;
+
+ if (IS_ERR(desc->tfm)) {
+ CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n",
+ (*type)->cht_name);
+ return PTR_ERR(desc->tfm);
+ }
+
+ desc->flags = 0;
+
+ /* Shash uses a different initialization order than digest:
+ * shash: crypto_hash_setkey, crypto_hash_init
+ * digest: crypto_digest_init, crypto_digest_setkey
+ * The digest case is not handled here because we use the shash
+ * logic in cfs_crypto_hash_alloc.
+ */
+ if (key != NULL) {
+ err = crypto_hash_setkey(desc->tfm, key, key_len);
+ } else if ((*type)->cht_key != 0) {
+ err = crypto_hash_setkey(desc->tfm,
+ (unsigned char *)&((*type)->cht_key),
+ (*type)->cht_size);
+ }
+
+ if (err != 0) {
+ crypto_free_hash(desc->tfm);
+ return err;
+ }
+
+ CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n",
+ (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name,
+ (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name,
+ cfs_crypto_hash_speeds[alg_id]);
+
+ return crypto_hash_init(desc);
+}
+
+int cfs_crypto_hash_digest(unsigned char alg_id,
+ const void *buf, unsigned int buf_len,
+ unsigned char *key, unsigned int key_len,
+ unsigned char *hash, unsigned int *hash_len)
+{
+ struct scatterlist sl;
+ struct hash_desc hdesc;
+ int err;
+ const struct cfs_crypto_hash_type *type;
+
+ if (buf == NULL || buf_len == 0 || hash_len == NULL)
+ return -EINVAL;
+
+ err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len);
+ if (err != 0)
+ return err;
+
+ if (hash == NULL || *hash_len < type->cht_size) {
+ *hash_len = type->cht_size;
+ crypto_free_hash(hdesc.tfm);
+ return -ENOSPC;
+ }
+ sg_init_one(&sl, (void *)buf, buf_len);
+
+ hdesc.flags = 0;
+ err = crypto_hash_digest(&hdesc, &sl, sl.length, hash);
+ crypto_free_hash(hdesc.tfm);
+
+ return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
+
+struct cfs_crypto_hash_desc *
+ cfs_crypto_hash_init(unsigned char alg_id,
+ unsigned char *key, unsigned int key_len)
+{
+
+ struct hash_desc *hdesc;
+ int err;
+ const struct cfs_crypto_hash_type *type;
+
+ hdesc = kmalloc(sizeof(*hdesc), 0);
+ if (hdesc == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len);
+
+ if (err) {
+ kfree(hdesc);
+ return ERR_PTR(err);
+ }
+ return (struct cfs_crypto_hash_desc *)hdesc;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_init);
+
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc,
+ struct page *page, unsigned int offset,
+ unsigned int len)
+{
+ struct scatterlist sl;
+
+ sg_init_table(&sl, 1);
+ sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK);
+
+ return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update_page);
+
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc,
+ const void *buf, unsigned int buf_len)
+{
+ struct scatterlist sl;
+
+ sg_init_one(&sl, (void *)buf, buf_len);
+
+ return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update);
+
+/* If hash_len pointer is NULL - destroy descriptor. */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc,
+ unsigned char *hash, unsigned int *hash_len)
+{
+ int err;
+ int size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm);
+
+ if (hash_len == NULL) {
+ crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+ kfree(hdesc);
+ return 0;
+ }
+ if (hash == NULL || *hash_len < size) {
+ *hash_len = size;
+ return -ENOSPC;
+ }
+ err = crypto_hash_final((struct hash_desc *) hdesc, hash);
+
+ if (err < 0) {
+ /* maybe the caller can fix the error */
+ return err;
+ }
+ crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+ kfree(hdesc);
+ return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_final);
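+
+/*
+ * A minimal usage sketch of the incremental hash API above (error
+ * handling omitted), assuming an algorithm id from libcfs_crypto.h
+ * such as CFS_HASH_ALG_ADLER32:
+ *
+ *    struct cfs_crypto_hash_desc *desc;
+ *    unsigned char digest[4];
+ *    unsigned int len = sizeof(digest);
+ *
+ *    desc = cfs_crypto_hash_init(CFS_HASH_ALG_ADLER32, NULL, 0);
+ *    cfs_crypto_hash_update(desc, buf, buf_len);
+ *    cfs_crypto_hash_final(desc, digest, &len);
+ *
+ * passing a non-NULL hash_len to cfs_crypto_hash_final also frees
+ * the descriptor.
+ */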
+
+static void cfs_crypto_performance_test(unsigned char alg_id,
+ const unsigned char *buf,
+ unsigned int buf_len)
+{
+ unsigned long start, end;
+ int bcount, err = 0;
+ int sec = 1; /* do test only 1 sec */
+ unsigned char hash[64];
+ unsigned int hash_len = 64;
+
+ for (start = jiffies, end = start + sec * HZ, bcount = 0;
+ time_before(jiffies, end); bcount++) {
+ err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0,
+ hash, &hash_len);
+ if (err)
+ break;
+
+ }
+ end = jiffies;
+
+ if (err) {
+ cfs_crypto_hash_speeds[alg_id] = -1;
+ CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n",
+ cfs_crypto_hash_name(alg_id), err);
+ } else {
+ unsigned long tmp;
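+
+ /* bytes hashed per msec, scaled to bytes/sec, then to MB/s */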
+ tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) *
+ 1000) / (1024 * 1024);
+ cfs_crypto_hash_speeds[alg_id] = (int)tmp;
+ }
+ CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n",
+ cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]);
+}
+
+int cfs_crypto_hash_speed(unsigned char hash_alg)
+{
+ if (hash_alg < CFS_HASH_ALG_MAX)
+ return cfs_crypto_hash_speeds[hash_alg];
+ else
+ return -1;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_speed);
+
+/**
+ * Do performance test for all hash algorithms.
+ */
+static int cfs_crypto_test_hashes(void)
+{
+ unsigned char i;
+ unsigned char *data;
+ unsigned int j;
+ /* Data block size for testing hash. Maximum
+ * kmalloc size for 2.6.18 kernel is 128K */
+ unsigned int data_len = 1 * 128 * 1024;
+
+ data = kmalloc(data_len, 0);
+ if (data == NULL) {
+ CERROR("Failed to allocate mem\n");
+ return -ENOMEM;
+ }
+
+ for (j = 0; j < data_len; j++)
+ data[j] = j & 0xff;
+
+ for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+ cfs_crypto_performance_test(i, data, data_len);
+
+ kfree(data);
+ return 0;
+}
+
+static int adler32;
+
+int cfs_crypto_register(void)
+{
+ request_module("crc32c");
+
+ adler32 = cfs_crypto_adler32_register();
+
+ /* check all algorithms and do performance test */
+ cfs_crypto_test_hashes();
+ return 0;
+}
+void cfs_crypto_unregister(void)
+{
+ if (adler32 == 0)
+ cfs_crypto_adler32_unregister();
+
+ return;
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h
new file mode 100644
index 000000000..18e8cd4d8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.h
@@ -0,0 +1,29 @@
+ /*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/**
+ * Functions for start/stop shash adler32 algorithm.
+ */
+int cfs_crypto_adler32_register(void);
+void cfs_crypto_adler32_unregister(void);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
new file mode 100644
index 000000000..277f6b890
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
@@ -0,0 +1,111 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-curproc.c
+ *
+ * Lustre curproc API implementation for Linux kernel
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <linux/compat.h>
+#include <linux/thread_info.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+void cfs_cap_raise(cfs_cap_t cap)
+{
+ struct cred *cred;
+
+ cred = prepare_creds();
+ if (cred) {
+ cap_raise(cred->cap_effective, cap);
+ commit_creds(cred);
+ }
+}
+
+void cfs_cap_lower(cfs_cap_t cap)
+{
+ struct cred *cred;
+
+ cred = prepare_creds();
+ if (cred) {
+ cap_lower(cred->cap_effective, cap);
+ commit_creds(cred);
+ }
+}
+
+int cfs_cap_raised(cfs_cap_t cap)
+{
+ return cap_raised(current_cap(), cap);
+}
+
+static void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap)
+{
+ /* XXX lost high byte */
+ *cap = kcap.cap[0];
+}
+
+cfs_cap_t cfs_curproc_cap_pack(void)
+{
+ cfs_cap_t cap;
+ cfs_kernel_cap_pack(current_cap(), &cap);
+ return cap;
+}
+
+EXPORT_SYMBOL(cfs_cap_raise);
+EXPORT_SYMBOL(cfs_cap_lower);
+EXPORT_SYMBOL(cfs_cap_raised);
+EXPORT_SYMBOL(cfs_curproc_cap_pack);
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
new file mode 100644
index 000000000..4545d54f7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
@@ -0,0 +1,200 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <linux/completion.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#include "../tracefile.h"
+
+#include <linux/kallsyms.h>
+
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";
+char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall";
+
+/**
+ * Upcall function once a Lustre log has been dumped.
+ *
+ * \param file path of the dumped log
+ */
+void libcfs_run_debug_log_upcall(char *file)
+{
+ char *argv[3];
+ int rc;
+ char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL};
+
+ argv[0] = lnet_debug_log_upcall;
+
+ LASSERTF(file != NULL, "called on a null filename\n");
+ argv[1] = file; /* only need to pass the path of the file */
+
+ argv[2] = NULL;
+
+ rc = call_usermodehelper(argv[0], argv, envp, 1);
+ if (rc < 0 && rc != -ENOENT) {
+ CERROR("Error %d invoking LNET debug log upcall %s %s; check /proc/sys/lnet/debug_log_upcall\n",
+ rc, argv[0], argv[1]);
+ } else {
+ CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+ argv[0], argv[1]);
+ }
+}
+
+void libcfs_run_upcall(char **argv)
+{
+ int rc;
+ int argc;
+ char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL};
+
+ argv[0] = lnet_upcall;
+ argc = 1;
+ while (argv[argc] != NULL)
+ argc++;
+
+ LASSERT(argc >= 2);
+
+ rc = call_usermodehelper(argv[0], argv, envp, 1);
+ if (rc < 0 && rc != -ENOENT) {
+ CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; check /proc/sys/lnet/upcall\n",
+ rc, argv[0], argv[1],
+ argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+ argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+ argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+ argc < 6 ? "" : ",...");
+ } else {
+ CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
+ argv[0], argv[1],
+ argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+ argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+ argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+ argc < 6 ? "" : ",...");
+ }
+}
+
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata)
+{
+ char *argv[6];
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%d", msgdata->msg_line);
+
+ argv[1] = "LBUG";
+ argv[2] = (char *)msgdata->msg_file;
+ argv[3] = (char *)msgdata->msg_fn;
+ argv[4] = buf;
+ argv[5] = NULL;
+
+ libcfs_run_upcall (argv);
+}
+
+/* coverity[+kill] */
+void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
+{
+ libcfs_catastrophe = 1;
+ libcfs_debug_msg(msgdata, "LBUG\n");
+
+ if (in_interrupt()) {
+ panic("LBUG in interrupt.\n");
+ /* not reached */
+ }
+
+ dump_stack();
+ if (!libcfs_panic_on_lbug)
+ libcfs_debug_dumplog();
+ libcfs_run_lbug_upcall(msgdata);
+ if (libcfs_panic_on_lbug)
+ panic("LBUG");
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+}
+
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+ void *unused2)
+{
+ if (libcfs_panic_in_progress)
+ return 0;
+
+ libcfs_panic_in_progress = 1;
+ mb();
+
+ return 0;
+}
+
+static struct notifier_block libcfs_panic_notifier = {
+ .notifier_call = panic_notifier,
+ .next = NULL,
+ .priority = 10000,
+};
+
+void libcfs_register_panic_notifier(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+void libcfs_unregister_panic_notifier(void)
+{
+ atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
new file mode 100644
index 000000000..e962f8968
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#define LNET_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+ struct libcfs_ioctl_hdr *hdr;
+ struct libcfs_ioctl_data *data;
+ int orig_len;
+
+ hdr = (struct libcfs_ioctl_hdr *)buf;
+ data = (struct libcfs_ioctl_data *)buf;
+
+ if (copy_from_user(buf, (void *)arg, sizeof(*hdr)))
+ return -EFAULT;
+
+ if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+ CERROR("PORTALS: version mismatch kernel vs application\n");
+ return -EINVAL;
+ }
+
+ if (hdr->ioc_len >= end - buf) {
+ CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+ return -EINVAL;
+ }
+
+
+ if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+ CERROR("PORTALS: user buffer too small for ioctl\n");
+ return -EINVAL;
+ }
+
+ orig_len = hdr->ioc_len;
+ if (copy_from_user(buf, (void *)arg, hdr->ioc_len))
+ return -EFAULT;
+ if (orig_len != data->ioc_len)
+ return -EINVAL;
+
+ if (libcfs_ioctl_is_invalid(data)) {
+ CERROR("PORTALS: ioctl not correctly formatted\n");
+ return -EINVAL;
+ }
+
+ if (data->ioc_inllen1)
+ data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+ if (data->ioc_inllen2)
+ data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+ cfs_size_round(data->ioc_inllen1);
+
+ return 0;
+}
+
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+ if (copy_to_user((char *)arg, data, size))
+ return -EFAULT;
+ return 0;
+}
+
+extern struct cfs_psdev_ops libcfs_psdev_ops;
+
+static int
+libcfs_psdev_open(struct inode *inode, struct file *file)
+{
+ struct libcfs_device_userstate **pdu = NULL;
+ int rc = 0;
+
+ if (!inode)
+ return -EINVAL;
+ pdu = (struct libcfs_device_userstate **)&file->private_data;
+ if (libcfs_psdev_ops.p_open != NULL)
+ rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+ else
+ return -EPERM;
+ return rc;
+}
+
+/* called when closing /dev/device */
+static int
+libcfs_psdev_release(struct inode *inode, struct file *file)
+{
+ struct libcfs_device_userstate *pdu;
+ int rc = 0;
+
+ if (!inode)
+ return -EINVAL;
+ pdu = file->private_data;
+ if (libcfs_psdev_ops.p_close != NULL)
+ rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+ else
+ rc = -EPERM;
+ return rc;
+}
+
+static long libcfs_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct cfs_psdev_file pfile;
+ int rc = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+ _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR ||
+ _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) {
+ CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+ _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+ return -EINVAL;
+ }
+
+ /* Handle platform-dependent IOC requests */
+ switch (cmd) {
+ case IOC_LIBCFS_PANIC:
+ if (!capable(CFS_CAP_SYS_BOOT))
+ return -EPERM;
+ panic("debugctl-invoked panic");
+ return 0;
+ case IOC_LIBCFS_MEMHOG:
+ if (!capable(CFS_CAP_SYS_ADMIN))
+ return -EPERM;
+ /* fall through */
+ }
+
+ pfile.off = 0;
+ pfile.private_data = file->private_data;
+ if (libcfs_psdev_ops.p_ioctl != NULL)
+ rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+ else
+ rc = -EPERM;
+ return rc;
+}
+
+static const struct file_operations libcfs_fops = {
+ .unlocked_ioctl = libcfs_ioctl,
+ .open = libcfs_psdev_open,
+ .release = libcfs_psdev_release,
+};
+
+struct miscdevice libcfs_dev = {
+ .minor = LNET_MINOR,
+ .name = "lnet",
+ .fops = &libcfs_fops,
+};
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
new file mode 100644
index 000000000..838f5f3bd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
@@ -0,0 +1,217 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#if defined(CONFIG_KGDB)
+#include <linux/kgdb.h>
+#endif
+
+/**
+ * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively
+ * waiting threads, which is not always desirable because all threads will
+ * be woken up again and again, even though the user may only need a few of
+ * them to be active most of the time. This is bad for performance because
+ * the cache can be polluted by different threads.
+ *
+ * A LIFO list resolves this problem because we always wake up the most
+ * recently active thread by default.
+ *
+ * NB: please don't mix non-exclusive and exclusive waits on the same
+ * waitq if add_wait_queue_exclusive_head is used.
+ */
+void
+add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&waitq->lock, flags);
+ __add_wait_queue_exclusive(waitq, link);
+ spin_unlock_irqrestore(&waitq->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_head);
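+
+/*
+ * Minimal usage sketch (work_available() is a hypothetical predicate on a
+ * hypothetical service struct): exclusive waiters added at the head are
+ * woken in LIFO order, so the most recently active thread runs again first.
+ *
+ *	wait_queue_t wait;
+ *
+ *	init_waitqueue_entry(&wait, current);
+ *	add_wait_queue_exclusive_head(&svc->waitq, &wait);
+ *	for (;;) {
+ *		set_current_state(TASK_INTERRUPTIBLE);
+ *		if (work_available(svc))
+ *			break;
+ *		schedule();
+ *	}
+ *	set_current_state(TASK_RUNNING);
+ *	remove_wait_queue(&svc->waitq, &wait);
+ */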
+
+void cfs_init_timer(struct timer_list *t)
+{
+ init_timer(t);
+}
+EXPORT_SYMBOL(cfs_init_timer);
+
+void cfs_timer_init(struct timer_list *t, cfs_timer_func_t *func, void *arg)
+{
+ init_timer(t);
+ t->function = func;
+ t->data = (unsigned long)arg;
+}
+EXPORT_SYMBOL(cfs_timer_init);
+
+void cfs_timer_done(struct timer_list *t)
+{
+ return;
+}
+EXPORT_SYMBOL(cfs_timer_done);
+
+void cfs_timer_arm(struct timer_list *t, unsigned long deadline)
+{
+ mod_timer(t, deadline);
+}
+EXPORT_SYMBOL(cfs_timer_arm);
+
+void cfs_timer_disarm(struct timer_list *t)
+{
+ del_timer(t);
+}
+EXPORT_SYMBOL(cfs_timer_disarm);
+
+int cfs_timer_is_armed(struct timer_list *t)
+{
+ return timer_pending(t);
+}
+EXPORT_SYMBOL(cfs_timer_is_armed);
+
+unsigned long cfs_timer_deadline(struct timer_list *t)
+{
+ return t->expires;
+}
+EXPORT_SYMBOL(cfs_timer_deadline);
+
+void cfs_enter_debugger(void)
+{
+#if defined(CONFIG_KGDB)
+ /* BREAKPOINT(); */
+#else
+ /* nothing */
+#endif
+}
+EXPORT_SYMBOL(cfs_enter_debugger);
+
+sigset_t
+cfs_block_allsigs(void)
+{
+ unsigned long flags;
+ sigset_t old;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ old = current->blocked;
+ sigfillset(&current->blocked);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+ return old;
+}
+EXPORT_SYMBOL(cfs_block_allsigs);
+
+sigset_t cfs_block_sigs(unsigned long sigs)
+{
+ unsigned long flags;
+ sigset_t old;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ old = current->blocked;
+ sigaddsetmask(&current->blocked, sigs);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ return old;
+}
+EXPORT_SYMBOL(cfs_block_sigs);
+
+/* Block all signals except for the @sigs */
+sigset_t cfs_block_sigsinv(unsigned long sigs)
+{
+ unsigned long flags;
+ sigset_t old;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ old = current->blocked;
+ sigaddsetmask(&current->blocked, ~sigs);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+ return old;
+}
+EXPORT_SYMBOL(cfs_block_sigsinv);
+
+void
+cfs_restore_sigs(sigset_t old)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->blocked = old;
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+EXPORT_SYMBOL(cfs_restore_sigs);
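+
+/*
+ * Typical pairing, as a sketch only: block every signal around a phase that
+ * must not be interrupted, then restore the saved mask afterwards.
+ *
+ *	sigset_t blocked = cfs_block_allsigs();
+ *
+ *	... do work that must not be interrupted by signals ...
+ *
+ *	cfs_restore_sigs(blocked);
+ */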
+
+int
+cfs_signal_pending(void)
+{
+ return signal_pending(current);
+}
+EXPORT_SYMBOL(cfs_signal_pending);
+
+void
+cfs_clear_sigpending(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ clear_tsk_thread_flag(current, TIF_SIGPENDING);
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+EXPORT_SYMBOL(cfs_clear_sigpending);
+
+int
+libcfs_arch_init(void)
+{
+ return 0;
+}
+EXPORT_SYMBOL(libcfs_arch_init);
+
+void
+libcfs_arch_cleanup(void)
+{
+ return;
+}
+EXPORT_SYMBOL(libcfs_arch_cleanup);
+
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
new file mode 100644
index 000000000..f2462e7f0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
@@ -0,0 +1,623 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../../include/linux/libcfs/libcfs.h"
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#include <linux/syscalls.h>
+
+static int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+ mm_segment_t oldmm = get_fs();
+ struct socket *sock;
+ int rc;
+ struct file *sock_filp;
+
+ rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+ if (rc != 0) {
+ CERROR ("Can't create socket: %d\n", rc);
+ return rc;
+ }
+
+ sock_filp = sock_alloc_file(sock, 0, NULL);
+ if (IS_ERR(sock_filp)) {
+ sock_release(sock);
+ rc = PTR_ERR(sock_filp);
+ goto out;
+ }
+
+ set_fs(KERNEL_DS);
+ if (sock_filp->f_op->unlocked_ioctl)
+ rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+ set_fs(oldmm);
+
+ fput(sock_filp);
+out:
+ return rc;
+}
+
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+ struct ifreq ifr;
+ int nob;
+ int rc;
+ __u32 val;
+
+ nob = strnlen(name, IFNAMSIZ);
+ if (nob == IFNAMSIZ) {
+ CERROR("Interface name %s too long\n", name);
+ return -EINVAL;
+ }
+
+ CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+ strcpy(ifr.ifr_name, name);
+ rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+
+ if (rc != 0) {
+ CERROR("Can't get flags for interface %s\n", name);
+ return rc;
+ }
+
+ if ((ifr.ifr_flags & IFF_UP) == 0) {
+ CDEBUG(D_NET, "Interface %s down\n", name);
+ *up = 0;
+ *ip = *mask = 0;
+ return 0;
+ }
+
+ *up = 1;
+
+ strcpy(ifr.ifr_name, name);
+ ifr.ifr_addr.sa_family = AF_INET;
+ rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+
+ if (rc != 0) {
+ CERROR("Can't get IP address for interface %s\n", name);
+ return rc;
+ }
+
+ val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+ *ip = ntohl(val);
+
+ strcpy(ifr.ifr_name, name);
+ ifr.ifr_addr.sa_family = AF_INET;
+ rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+
+ if (rc != 0) {
+ CERROR("Can't get netmask for interface %s\n", name);
+ return rc;
+ }
+
+ val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+ *mask = ntohl(val);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+ /* Allocate and fill in 'names', returning # interfaces/error */
+ char **names;
+ int toobig;
+ int nalloc;
+ int nfound;
+ struct ifreq *ifr;
+ struct ifconf ifc;
+ int rc;
+ int nob;
+ int i;
+
+ nalloc = 16; /* first guess at max interfaces */
+ toobig = 0;
+ for (;;) {
+ if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+ toobig = 1;
+ nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
+ CWARN("Too many interfaces: only enumerating first %d\n",
+ nalloc);
+ }
+
+ LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+ if (ifr == NULL) {
+ CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+ rc = -ENOMEM;
+ goto out0;
+ }
+
+ ifc.ifc_buf = (char *)ifr;
+ ifc.ifc_len = nalloc * sizeof(*ifr);
+
+ rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+
+ if (rc < 0) {
+ CERROR ("Error %d enumerating interfaces\n", rc);
+ goto out1;
+ }
+
+ LASSERT (rc == 0);
+
+ nfound = ifc.ifc_len/sizeof(*ifr);
+ LASSERT (nfound <= nalloc);
+
+ if (nfound < nalloc || toobig)
+ break;
+
+ LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ nalloc *= 2;
+ }
+
+ if (nfound == 0)
+ goto out1;
+
+ LIBCFS_ALLOC(names, nfound * sizeof(*names));
+ if (names == NULL) {
+ rc = -ENOMEM;
+ goto out1;
+ }
+
+ for (i = 0; i < nfound; i++) {
+
+ nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+ if (nob == IFNAMSIZ) {
+ /* no space for terminating NULL */
+ CERROR("interface name %.*s too long (%d max)\n",
+ nob, ifr[i].ifr_name, IFNAMSIZ);
+ rc = -ENAMETOOLONG;
+ goto out2;
+ }
+
+ LIBCFS_ALLOC(names[i], IFNAMSIZ);
+ if (names[i] == NULL) {
+ rc = -ENOMEM;
+ goto out2;
+ }
+
+ memcpy(names[i], ifr[i].ifr_name, nob);
+ names[i][nob] = 0;
+ }
+
+ *namesp = names;
+ rc = nfound;
+
+ out2:
+ if (rc < 0)
+ libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+ LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+ return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+ int i;
+
+ LASSERT (n > 0);
+
+ for (i = 0; i < n && names[i] != NULL; i++)
+ LIBCFS_FREE(names[i], IFNAMSIZ);
+
+ LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
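+
+/*
+ * Sketch of the expected calling convention (illustrative only): enumerate
+ * the interfaces, query each one, then free the name array with the count
+ * returned by libcfs_ipif_enumerate().
+ *
+ *	char **names;
+ *	int i, up, n = libcfs_ipif_enumerate(&names);
+ *	__u32 ip, mask;
+ *
+ *	if (n <= 0)
+ *		return n;
+ *	for (i = 0; i < n; i++)
+ *		if (libcfs_ipif_query(names[i], &up, &ip, &mask) == 0 && up)
+ *			CDEBUG(D_NET, "%s: %pI4h\n", names[i], &ip);
+ *	libcfs_ipif_free_enumeration(names, n);
+ */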
+
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+ int rc;
+ long ticks = timeout * HZ;
+ unsigned long then;
+ struct timeval tv;
+
+ LASSERT (nob > 0);
+ /* Caller may pass a zero timeout if she thinks the socket buffer is
+ * empty enough to take the whole message immediately */
+
+ for (;;) {
+ struct kvec iov = {
+ .iov_base = buffer,
+ .iov_len = nob
+ };
+ struct msghdr msg = {
+ .msg_flags = (timeout == 0) ? MSG_DONTWAIT : 0
+ };
+
+ if (timeout != 0) {
+ /* Set send timeout to remaining time */
+ tv = (struct timeval) {
+ .tv_sec = ticks / HZ,
+ .tv_usec = ((ticks % HZ) * 1000000) / HZ
+ };
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+ (char *)&tv, sizeof(tv));
+ if (rc != 0) {
+ CERROR("Can't set socket send timeout %ld.%06d: %d\n",
+ (long)tv.tv_sec, (int)tv.tv_usec, rc);
+ return rc;
+ }
+ }
+
+ then = jiffies;
+ rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
+ ticks -= jiffies - then;
+
+ if (rc == nob)
+ return 0;
+
+ if (rc < 0)
+ return rc;
+
+ if (rc == 0) {
+ CERROR ("Unexpected zero rc\n");
+ return -ECONNABORTED;
+ }
+
+ if (ticks <= 0)
+ return -EAGAIN;
+
+ buffer = ((char *)buffer) + rc;
+ nob -= rc;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+ int rc;
+ long ticks = timeout * HZ;
+ unsigned long then;
+ struct timeval tv;
+
+ LASSERT (nob > 0);
+ LASSERT (ticks > 0);
+
+ for (;;) {
+ struct kvec iov = {
+ .iov_base = buffer,
+ .iov_len = nob
+ };
+ struct msghdr msg = {
+ .msg_flags = 0
+ };
+
+ /* Set receive timeout to remaining time */
+ tv = (struct timeval) {
+ .tv_sec = ticks / HZ,
+ .tv_usec = ((ticks % HZ) * 1000000) / HZ
+ };
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+ (char *)&tv, sizeof(tv));
+ if (rc != 0) {
+ CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+ (long)tv.tv_sec, (int)tv.tv_usec, rc);
+ return rc;
+ }
+
+ then = jiffies;
+ rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
+ ticks -= jiffies - then;
+
+ if (rc < 0)
+ return rc;
+
+ if (rc == 0)
+ return -ECONNRESET;
+
+ buffer = ((char *)buffer) + rc;
+ nob -= rc;
+
+ if (nob == 0)
+ return 0;
+
+ if (ticks <= 0)
+ return -ETIMEDOUT;
+ }
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
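+
+/*
+ * Usage sketch (assumptions: an already-connected socket and a fixed-size
+ * wire header called "hdr"): both helpers loop until the whole buffer has
+ * been transferred or the timeout (in seconds) expires, so partial
+ * reads/writes are never returned to the caller.
+ *
+ *	struct hdr h;
+ *	int rc = libcfs_sock_read(sock, &h, sizeof(h), 30);
+ *
+ *	if (rc == -ETIMEDOUT || rc == -ECONNRESET)
+ *		... peer is unresponsive or gone ...
+ *	else if (rc == 0)
+ *		rc = libcfs_sock_write(sock, &reply, sizeof(reply), 30);
+ */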
+
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+ __u32 local_ip, int local_port)
+{
+ struct sockaddr_in locaddr;
+ struct socket *sock;
+ int rc;
+ int option;
+
+ /* All errors are fatal except bind failure if the port is in use */
+ *fatal = 1;
+
+ rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+ *sockp = sock;
+ if (rc != 0) {
+ CERROR ("Can't create socket: %d\n", rc);
+ return rc;
+ }
+
+ option = 1;
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+ (char *)&option, sizeof (option));
+ if (rc != 0) {
+ CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+ goto failed;
+ }
+
+ if (local_ip != 0 || local_port != 0) {
+ memset(&locaddr, 0, sizeof(locaddr));
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_port = htons(local_port);
+ locaddr.sin_addr.s_addr = (local_ip == 0) ?
+ INADDR_ANY : htonl(local_ip);
+
+ rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+ sizeof(locaddr));
+ if (rc == -EADDRINUSE) {
+ CDEBUG(D_NET, "Port %d already in use\n", local_port);
+ *fatal = 0;
+ goto failed;
+ }
+ if (rc != 0) {
+ CERROR("Error trying to bind to port %d: %d\n",
+ local_port, rc);
+ goto failed;
+ }
+ }
+
+ return 0;
+
+ failed:
+ sock_release(sock);
+ return rc;
+}
+
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+ int option;
+ int rc;
+
+ if (txbufsize != 0) {
+ option = txbufsize;
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+ (char *)&option, sizeof (option));
+ if (rc != 0) {
+ CERROR ("Can't set send buffer %d: %d\n",
+ option, rc);
+ return rc;
+ }
+ }
+
+ if (rxbufsize != 0) {
+ option = rxbufsize;
+ rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+ (char *)&option, sizeof (option));
+ if (rc != 0) {
+ CERROR ("Can't set receive buffer %d: %d\n",
+ option, rc);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+ struct sockaddr_in sin;
+ int len = sizeof (sin);
+ int rc;
+
+ rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len,
+ remote ? 2 : 0);
+ if (rc != 0) {
+ CERROR ("Error %d getting sock %s IP/port\n",
+ rc, remote ? "peer" : "local");
+ return rc;
+ }
+
+ if (ip != NULL)
+ *ip = ntohl (sin.sin_addr.s_addr);
+
+ if (port != NULL)
+ *port = ntohs (sin.sin_port);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+
+ if (txbufsize != NULL) {
+ *txbufsize = sock->sk->sk_sndbuf;
+ }
+
+ if (rxbufsize != NULL) {
+ *rxbufsize = sock->sk->sk_rcvbuf;
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+int
+libcfs_sock_listen (struct socket **sockp,
+ __u32 local_ip, int local_port, int backlog)
+{
+ int fatal;
+ int rc;
+
+ rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+ if (rc != 0) {
+ if (!fatal)
+ CERROR("Can't create socket: port %d already in use\n",
+ local_port);
+ return rc;
+ }
+
+ rc = (*sockp)->ops->listen(*sockp, backlog);
+ if (rc == 0)
+ return 0;
+
+ CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+ sock_release(*sockp);
+ return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+ wait_queue_t wait;
+ struct socket *newsock;
+ int rc;
+
+ init_waitqueue_entry(&wait, current);
+
+ /* XXX this should add a ref to sock->ops->owner, if
+ * TCP could be a module */
+ rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
+ if (rc) {
+ CERROR("Can't allocate socket\n");
+ return rc;
+ }
+
+ newsock->ops = sock->ops;
+
+ rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+ if (rc == -EAGAIN) {
+ /* Nothing ready, so wait for activity */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sock->sk), &wait);
+ schedule();
+ remove_wait_queue(sk_sleep(sock->sk), &wait);
+ set_current_state(TASK_RUNNING);
+ rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+ }
+
+ if (rc != 0)
+ goto failed;
+
+ *newsockp = newsock;
+ return 0;
+
+ failed:
+ sock_release(newsock);
+ return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+ wake_up_all(sk_sleep(sock->sk));
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+ __u32 local_ip, int local_port,
+ __u32 peer_ip, int peer_port)
+{
+ struct sockaddr_in srvaddr;
+ int rc;
+
+ rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+ if (rc != 0)
+ return rc;
+
+ memset (&srvaddr, 0, sizeof (srvaddr));
+ srvaddr.sin_family = AF_INET;
+ srvaddr.sin_port = htons(peer_port);
+ srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+ rc = (*sockp)->ops->connect(*sockp,
+ (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+ 0);
+ if (rc == 0)
+ return 0;
+
+ /* EADDRNOTAVAIL probably means we're already connected to the same
+ * peer/port on the same local port on a differently typed
+ * connection. Let our caller retry with a different local
+ * port... */
+ *fatal = !(rc == -EADDRNOTAVAIL);
+
+ CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
+ "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc,
+ &local_ip, local_port, &peer_ip, peer_port);
+
+ sock_release(*sockp);
+ return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
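+
+/*
+ * Sketch of the retry loop the "fatal" out-parameter is designed for
+ * (illustrative; the privileged-port range is an assumption): walk the
+ * local ports and only give up when the error is fatal.
+ *
+ *	for (port = 1023; port > 512; port--) {
+ *		rc = libcfs_sock_connect(&sock, &fatal, local_ip, port,
+ *					 peer_ip, peer_port);
+ *		if (rc == 0)
+ *			break;
+ *		if (fatal)
+ *			return rc;
+ *	}
+ */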
+
+void
+libcfs_sock_release (struct socket *sock)
+{
+ sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
new file mode 100644
index 000000000..c8e293002
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "../tracefile.h"
+
+/* percentage share of the total debug memory for each type */
+static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = {
+ 80, /* 80% pages for CFS_TCD_TYPE_PROC */
+ 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */
+ 10 /* 10% pages for CFS_TCD_TYPE_IRQ */
+};
+
+char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+
+struct rw_semaphore cfs_tracefile_sem;
+
+int cfs_tracefile_init_arch(void)
+{
+ int i;
+ int j;
+ struct cfs_trace_cpu_data *tcd;
+
+ init_rwsem(&cfs_tracefile_sem);
+
+ /* initialize trace_data */
+ memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
+ for (i = 0; i < CFS_TCD_TYPE_MAX; i++) {
+ cfs_trace_data[i] =
+ kmalloc(sizeof(union cfs_trace_data_union) *
+ num_possible_cpus(), GFP_KERNEL);
+ if (cfs_trace_data[i] == NULL)
+ goto out;
+
+ }
+
+ /* arch related info initialized */
+ cfs_tcd_for_each(tcd, i, j) {
+ spin_lock_init(&tcd->tcd_lock);
+ tcd->tcd_pages_factor = pages_factor[i];
+ tcd->tcd_type = i;
+ tcd->tcd_cpu = j;
+ }
+
+ for (i = 0; i < num_possible_cpus(); i++)
+ for (j = 0; j < 3; j++) {
+ cfs_trace_console_buffers[i][j] =
+ kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE,
+ GFP_KERNEL);
+
+ if (cfs_trace_console_buffers[i][j] == NULL)
+ goto out;
+ }
+
+ return 0;
+
+out:
+ cfs_tracefile_fini_arch();
+ printk(KERN_ERR "lnet: Not enough memory\n");
+ return -ENOMEM;
+}
+
+void cfs_tracefile_fini_arch(void)
+{
+ int i;
+ int j;
+
+ for (i = 0; i < num_possible_cpus(); i++)
+ for (j = 0; j < 3; j++)
+ if (cfs_trace_console_buffers[i][j] != NULL) {
+ kfree(cfs_trace_console_buffers[i][j]);
+ cfs_trace_console_buffers[i][j] = NULL;
+ }
+
+ for (i = 0; cfs_trace_data[i] != NULL; i++) {
+ kfree(cfs_trace_data[i]);
+ cfs_trace_data[i] = NULL;
+ }
+}
+
+void cfs_tracefile_read_lock(void)
+{
+ down_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_unlock(void)
+{
+ up_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_lock(void)
+{
+ down_write(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_unlock(void)
+{
+ up_write(&cfs_tracefile_sem);
+}
+
+cfs_trace_buf_type_t cfs_trace_buf_idx_get(void)
+{
+ if (in_irq())
+ return CFS_TCD_TYPE_IRQ;
+ else if (in_softirq())
+ return CFS_TCD_TYPE_SOFTIRQ;
+ else
+ return CFS_TCD_TYPE_PROC;
+}
+
+/*
+ * The walking argument indicates that the locking comes from the
+ * all-tcd-types iterator; in that case we must take the lock and disable
+ * local irqs to avoid deadlocks with other interrupt locks that might be
+ * held. See LU-1311 for details.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+ __acquires(&tcd->tc_lock)
+{
+ __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+ if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+ spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+ else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+ spin_lock_bh(&tcd->tcd_lock);
+ else if (unlikely(walking))
+ spin_lock_irq(&tcd->tcd_lock);
+ else
+ spin_lock(&tcd->tcd_lock);
+ return 1;
+}
+
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+ __releases(&tcd->tcd_lock)
+{
+ __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+ if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+ spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+ else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+ spin_unlock_bh(&tcd->tcd_lock);
+ else if (unlikely(walking))
+ spin_unlock_irq(&tcd->tcd_lock);
+ else
+ spin_unlock(&tcd->tcd_lock);
+}
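+
+/*
+ * The lock and unlock must be called with the same "walking" value, e.g.
+ * (sketch only):
+ *
+ *	if (cfs_trace_lock_tcd(tcd, 0)) {
+ *		... touch tcd->tcd_pages ...
+ *		cfs_trace_unlock_tcd(tcd, 0);
+ *	}
+ */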
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+ struct cfs_trace_page *tage)
+{
+ /*
+ * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+ * from here: this will lead to infinite recursion.
+ */
+ return tcd->tcd_cpu == tage->cpu;
+}
+
+void
+cfs_set_ptldebug_header(struct ptldebug_header *header,
+ struct libcfs_debug_msg_data *msgdata,
+ unsigned long stack)
+{
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+
+ header->ph_subsys = msgdata->msg_subsys;
+ header->ph_mask = msgdata->msg_mask;
+ header->ph_cpu_id = smp_processor_id();
+ header->ph_type = cfs_trace_buf_idx_get();
+ header->ph_sec = (__u32)tv.tv_sec;
+ header->ph_usec = tv.tv_usec;
+ header->ph_stack = stack;
+ header->ph_pid = current->pid;
+ header->ph_line_num = msgdata->msg_line;
+ header->ph_extern_pid = 0;
+ return;
+}
+
+static char *
+dbghdr_to_err_string(struct ptldebug_header *hdr)
+{
+ switch (hdr->ph_subsys) {
+
+ case S_LND:
+ case S_LNET:
+ return "LNetError";
+ default:
+ return "LustreError";
+ }
+}
+
+static char *
+dbghdr_to_info_string(struct ptldebug_header *hdr)
+{
+ switch (hdr->ph_subsys) {
+
+ case S_LND:
+ case S_LNET:
+ return "LNet";
+ default:
+ return "Lustre";
+ }
+}
+
+void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+ const char *buf, int len, const char *file,
+ const char *fn)
+{
+ char *prefix = "Lustre", *ptype = NULL;
+
+ if ((mask & D_EMERG) != 0) {
+ prefix = dbghdr_to_err_string(hdr);
+ ptype = KERN_EMERG;
+ } else if ((mask & D_ERROR) != 0) {
+ prefix = dbghdr_to_err_string(hdr);
+ ptype = KERN_ERR;
+ } else if ((mask & D_WARNING) != 0) {
+ prefix = dbghdr_to_info_string(hdr);
+ ptype = KERN_WARNING;
+ } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) {
+ prefix = dbghdr_to_info_string(hdr);
+ ptype = KERN_INFO;
+ }
+
+ if ((mask & D_CONSOLE) != 0) {
+ printk("%s%s: %.*s", ptype, prefix, len, buf);
+ } else {
+ printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix,
+ hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num,
+ fn, len, buf);
+ }
+ return;
+}
+
+int cfs_trace_max_debug_mb(void)
+{
+ int total_mb = (totalram_pages >> (20 - PAGE_SHIFT));
+
+ return max(512, (total_mb * 80)/100);
+}
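+
+/*
+ * Worked example of the cap above (assuming 4 KiB pages, PAGE_SHIFT == 12):
+ * on an 8 GiB machine totalram_pages == 2097152, so total_mb == 2097152 >> 8
+ * == 8192 and the limit is max(512, 8192 * 80 / 100) == 6553 MB.
+ */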
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
new file mode 100644
index 000000000..ba84e4ffd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_TRACEFILE_H__
+#define __LIBCFS_LINUX_TRACEFILE_H__
+
+/**
+ * The three types of trace_data in Linux.
+ */
+typedef enum {
+ CFS_TCD_TYPE_PROC = 0,
+ CFS_TCD_TYPE_SOFTIRQ,
+ CFS_TCD_TYPE_IRQ,
+ CFS_TCD_TYPE_MAX
+} cfs_trace_buf_type_t;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c
new file mode 100644
index 000000000..f0ee76abf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/module.c
@@ -0,0 +1,976 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/list.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <asm/div64.h>
+
+#include "../../include/linux/libcfs/libcfs_crypto.h"
+#include "../../include/linux/lnet/lib-lnet.h"
+#include "../../include/linux/lnet/lnet.h"
+#include "tracefile.h"
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+extern struct miscdevice libcfs_dev;
+extern struct rw_semaphore cfs_tracefile_sem;
+extern struct mutex cfs_trace_thread_mutex;
+extern struct cfs_wi_sched *cfs_sched_rehash;
+extern void libcfs_init_nidstrings(void);
+
+static int insert_proc(void);
+static void remove_proc(void);
+
+static struct ctl_table_header *lnet_table_header;
+extern char lnet_upcall[1024];
+/**
+ * The path of debug log dump upcall script.
+ */
+extern char lnet_debug_log_upcall[1024];
+
+#define CTL_LNET (0x100)
+
+enum {
+ PSDEV_DEBUG = 1, /* control debugging */
+ PSDEV_SUBSYSTEM_DEBUG, /* control debugging */
+ PSDEV_PRINTK, /* force all messages to console */
+ PSDEV_CONSOLE_RATELIMIT, /* ratelimit console messages */
+ PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */
+ PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */
+ PSDEV_CONSOLE_BACKOFF, /* delay increase factor */
+ PSDEV_DEBUG_PATH, /* crashdump log location */
+ PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */
+ PSDEV_CPT_TABLE, /* information about cpu partitions */
+ PSDEV_LNET_UPCALL, /* User mode upcall script */
+ PSDEV_LNET_MEMUSED, /* bytes currently PORTAL_ALLOCated */
+ PSDEV_LNET_CATASTROPHE, /* if we have LBUGged or panic'd */
+ PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */
+ PSDEV_LNET_DUMP_KERNEL, /* snapshot kernel debug buffer to file */
+ PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */
+ PSDEV_LNET_DEBUG_MB, /* size of debug buffer */
+ PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */
+ PSDEV_LNET_WATCHDOG_RATELIMIT, /* ratelimit watchdog messages */
+ PSDEV_LNET_FORCE_LBUG, /* hook to force an LBUG */
+ PSDEV_LNET_FAIL_LOC, /* control test failures instrumentation */
+ PSDEV_LNET_FAIL_VAL, /* userdata for fail loc */
+};
+
+static void kportal_memhog_free (struct libcfs_device_userstate *ldu)
+{
+ struct page **level0p = &ldu->ldu_memhog_root_page;
+ struct page **level1p;
+ struct page **level2p;
+ int count1;
+ int count2;
+
+ if (*level0p != NULL) {
+
+ level1p = (struct page **)page_address(*level0p);
+ count1 = 0;
+
+ while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+ *level1p != NULL) {
+
+ level2p = (struct page **)page_address(*level1p);
+ count2 = 0;
+
+ while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+ *level2p != NULL) {
+
+ __free_page(*level2p);
+ ldu->ldu_memhog_pages--;
+ level2p++;
+ count2++;
+ }
+
+ __free_page(*level1p);
+ ldu->ldu_memhog_pages--;
+ level1p++;
+ count1++;
+ }
+
+ __free_page(*level0p);
+ ldu->ldu_memhog_pages--;
+
+ *level0p = NULL;
+ }
+
+ LASSERT (ldu->ldu_memhog_pages == 0);
+}
+
+static int kportal_memhog_alloc(struct libcfs_device_userstate *ldu, int npages,
+ gfp_t flags)
+{
+ struct page **level0p;
+ struct page **level1p;
+ struct page **level2p;
+ int count1;
+ int count2;
+
+ LASSERT (ldu->ldu_memhog_pages == 0);
+ LASSERT (ldu->ldu_memhog_root_page == NULL);
+
+ if (npages < 0)
+ return -EINVAL;
+
+ if (npages == 0)
+ return 0;
+
+ level0p = &ldu->ldu_memhog_root_page;
+ *level0p = alloc_page(flags);
+ if (*level0p == NULL)
+ return -ENOMEM;
+ ldu->ldu_memhog_pages++;
+
+ level1p = (struct page **)page_address(*level0p);
+ count1 = 0;
+ memset(level1p, 0, PAGE_CACHE_SIZE);
+
+ while (ldu->ldu_memhog_pages < npages &&
+ count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+ if (cfs_signal_pending())
+ return -EINTR;
+
+ *level1p = alloc_page(flags);
+ if (*level1p == NULL)
+ return -ENOMEM;
+ ldu->ldu_memhog_pages++;
+
+ level2p = (struct page **)page_address(*level1p);
+ count2 = 0;
+ memset(level2p, 0, PAGE_CACHE_SIZE);
+
+ while (ldu->ldu_memhog_pages < npages &&
+ count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+ if (cfs_signal_pending())
+ return -EINTR;
+
+ *level2p = alloc_page(flags);
+ if (*level2p == NULL)
+ return -ENOMEM;
+ ldu->ldu_memhog_pages++;
+
+ level2p++;
+ count2++;
+ }
+
+ level1p++;
+ count1++;
+ }
+
+ return 0;
+}
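+
+/*
+ * Capacity note (assuming 4 KiB pages and 8-byte pointers): each level
+ * holds PAGE_CACHE_SIZE / sizeof(struct page *) == 512 pointers, so the
+ * three-level tree above can pin at most 1 + 512 + 512 * 512 == 262657
+ * pages, i.e. roughly 1 GiB of memhog memory per open file handle.
+ */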
+
+/* called when opening /dev/device */
+static int libcfs_psdev_open(unsigned long flags, void *args)
+{
+ struct libcfs_device_userstate *ldu;
+
+ try_module_get(THIS_MODULE);
+
+ LIBCFS_ALLOC(ldu, sizeof(*ldu));
+ if (ldu != NULL) {
+ ldu->ldu_memhog_pages = 0;
+ ldu->ldu_memhog_root_page = NULL;
+ }
+ *(struct libcfs_device_userstate **)args = ldu;
+
+ return 0;
+}
+
+/* called when closing /dev/device */
+static int libcfs_psdev_release(unsigned long flags, void *args)
+{
+ struct libcfs_device_userstate *ldu;
+
+ ldu = (struct libcfs_device_userstate *)args;
+ if (ldu != NULL) {
+ kportal_memhog_free(ldu);
+ LIBCFS_FREE(ldu, sizeof(*ldu));
+ }
+
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static struct rw_semaphore ioctl_list_sem;
+static struct list_head ioctl_list;
+
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
+{
+ int rc = 0;
+
+ down_write(&ioctl_list_sem);
+ if (!list_empty(&hand->item))
+ rc = -EBUSY;
+ else
+ list_add_tail(&hand->item, &ioctl_list);
+ up_write(&ioctl_list_sem);
+
+ return rc;
+}
+EXPORT_SYMBOL(libcfs_register_ioctl);
+
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
+{
+ int rc = 0;
+
+ down_write(&ioctl_list_sem);
+ if (list_empty(&hand->item))
+ rc = -ENOENT;
+ else
+ list_del_init(&hand->item);
+ up_write(&ioctl_list_sem);
+
+ return rc;
+}
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
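+
+/*
+ * Registration sketch (the handler instance and callback are hypothetical;
+ * the struct fields are the ones used above): subsystems hook their own
+ * commands into /dev/lnet this way, and a handler must return -EINVAL for
+ * commands it does not recognise so libcfs_ioctl_int() keeps probing the
+ * remaining handlers.
+ *
+ *	static struct libcfs_ioctl_handler my_handler = {
+ *		.item		= LIST_HEAD_INIT(my_handler.item),
+ *		.handle_ioctl	= my_handle_ioctl,
+ *	};
+ *
+ *	rc = libcfs_register_ioctl(&my_handler);
+ *	...
+ *	libcfs_deregister_ioctl(&my_handler);
+ */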
+
+static int libcfs_ioctl_int(struct cfs_psdev_file *pfile, unsigned long cmd,
+ void *arg, struct libcfs_ioctl_data *data)
+{
+ int err = -EINVAL;
+
+ switch (cmd) {
+ case IOC_LIBCFS_CLEAR_DEBUG:
+ libcfs_debug_clear_buffer();
+ return 0;
+ /*
+ * case IOC_LIBCFS_PANIC:
+ * Handled in arch/cfs_module.c
+ */
+ case IOC_LIBCFS_MARK_DEBUG:
+ if (data->ioc_inlbuf1 == NULL ||
+ data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+ return -EINVAL;
+ libcfs_debug_mark_buffer(data->ioc_inlbuf1);
+ return 0;
+ case IOC_LIBCFS_MEMHOG:
+ if (pfile->private_data == NULL) {
+ err = -EINVAL;
+ } else {
+ kportal_memhog_free(pfile->private_data);
+ /* XXX ioc_flags is not a GFP mask at the moment; this needs to be fixed */
+ err = kportal_memhog_alloc(pfile->private_data,
+ data->ioc_count,
+ data->ioc_flags);
+ if (err != 0)
+ kportal_memhog_free(pfile->private_data);
+ }
+ break;
+
+ case IOC_LIBCFS_PING_TEST: {
+ extern void (kping_client)(struct libcfs_ioctl_data *);
+ void (*ping)(struct libcfs_ioctl_data *);
+
+ CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+ data->ioc_count, libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(data->ioc_nid));
+ ping = symbol_get(kping_client);
+ if (!ping)
+ CERROR("symbol_get failed\n");
+ else {
+ ping(data);
+ symbol_put(kping_client);
+ }
+ return 0;
+ }
+
+ default: {
+ struct libcfs_ioctl_handler *hand;
+ err = -EINVAL;
+ down_read(&ioctl_list_sem);
+ list_for_each_entry(hand, &ioctl_list, item) {
+ err = hand->handle_ioctl(cmd, data);
+ if (err != -EINVAL) {
+ if (err == 0)
+ err = libcfs_ioctl_popdata(arg,
+ data, sizeof (*data));
+ break;
+ }
+ }
+ up_read(&ioctl_list_sem);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg)
+{
+ char *buf;
+ struct libcfs_ioctl_data *data;
+ int err = 0;
+
+ LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* 'cmd' and permissions get checked in our arch-specific caller */
+ if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+ CERROR("PORTALS ioctl: data error\n");
+ err = -EINVAL;
+ goto out;
+ }
+ data = (struct libcfs_ioctl_data *)buf;
+
+ err = libcfs_ioctl_int(pfile, cmd, arg, data);
+
+out:
+ LIBCFS_FREE(buf, 1024);
+ return err;
+}
+
+struct cfs_psdev_ops libcfs_psdev_ops = {
+ libcfs_psdev_open,
+ libcfs_psdev_release,
+ NULL,
+ NULL,
+ libcfs_ioctl
+};
+
+static int init_libcfs_module(void)
+{
+ int rc;
+
+ libcfs_arch_init();
+ libcfs_init_nidstrings();
+ init_rwsem(&cfs_tracefile_sem);
+ mutex_init(&cfs_trace_thread_mutex);
+ init_rwsem(&ioctl_list_sem);
+ INIT_LIST_HEAD(&ioctl_list);
+ init_waitqueue_head(&cfs_race_waitq);
+
+ rc = libcfs_debug_init(5 * 1024 * 1024);
+ if (rc < 0) {
+ pr_err("LustreError: libcfs_debug_init: %d\n", rc);
+ return rc;
+ }
+
+ rc = cfs_cpu_init();
+ if (rc != 0)
+ goto cleanup_debug;
+
+ rc = misc_register(&libcfs_dev);
+ if (rc) {
+ CERROR("misc_register: error %d\n", rc);
+ goto cleanup_cpu;
+ }
+
+ rc = cfs_wi_startup();
+ if (rc) {
+ CERROR("initialize workitem: error %d\n", rc);
+ goto cleanup_deregister;
+ }
+
+ /* at most 4 threads, which should be enough for rehash */
+ rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4);
+ rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY,
+ rc, &cfs_sched_rehash);
+ if (rc != 0) {
+ CERROR("Startup workitem scheduler: error: %d\n", rc);
+ goto cleanup_deregister;
+ }
+
+ rc = cfs_crypto_register();
+ if (rc) {
+ CERROR("cfs_crypto_register: error %d\n", rc);
+ goto cleanup_wi;
+ }
+
+ rc = insert_proc();
+ if (rc) {
+ CERROR("insert_proc: error %d\n", rc);
+ goto cleanup_crypto;
+ }
+
+ CDEBUG (D_OTHER, "portals setup OK\n");
+ return 0;
+ cleanup_crypto:
+ cfs_crypto_unregister();
+ cleanup_wi:
+ cfs_wi_shutdown();
+ cleanup_deregister:
+ misc_deregister(&libcfs_dev);
+cleanup_cpu:
+ cfs_cpu_fini();
+ cleanup_debug:
+ libcfs_debug_cleanup();
+ return rc;
+}
+
+static void exit_libcfs_module(void)
+{
+ int rc;
+
+ remove_proc();
+
+ CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ if (cfs_sched_rehash != NULL) {
+ cfs_wi_sched_destroy(cfs_sched_rehash);
+ cfs_sched_rehash = NULL;
+ }
+
+ cfs_crypto_unregister();
+ cfs_wi_shutdown();
+
+ rc = misc_deregister(&libcfs_dev);
+ if (rc)
+ CERROR("misc_deregister error %d\n", rc);
+
+ cfs_cpu_fini();
+
+ if (atomic_read(&libcfs_kmemory) != 0)
+ CERROR("Portals memory leaked: %d bytes\n",
+ atomic_read(&libcfs_kmemory));
+
+ rc = libcfs_debug_cleanup();
+ if (rc)
+ pr_err("LustreError: libcfs_debug_cleanup: %d\n", rc);
+
+ libcfs_arch_cleanup();
+}
+
+static int proc_call_handler(void *data, int write, loff_t *ppos,
+ void __user *buffer, size_t *lenp,
+ int (*handler)(void *data, int write,
+ loff_t pos, void __user *buffer, int len))
+{
+ int rc = handler(data, write, *ppos, buffer, *lenp);
+
+ if (rc < 0)
+ return rc;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ return 0;
+}
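+
+/*
+ * The contract of the __proc_* helpers invoked through proc_call_handler():
+ * on read they return the number of bytes copied to the user buffer (or 0
+ * at EOF), on write the number of bytes consumed; negative values are
+ * errors.  A minimal read-only handler would look like this sketch, where
+ * the exported foo_value is hypothetical:
+ *
+ *	static int __proc_foo(void *data, int write, loff_t pos,
+ *			      void __user *buffer, int nob)
+ *	{
+ *		char tmp[16];
+ *		int len = snprintf(tmp, sizeof(tmp), "%d", foo_value);
+ *
+ *		if (write)
+ *			return -EPERM;
+ *		if (pos >= len)
+ *			return 0;
+ *		return cfs_trace_copyout_string(buffer, nob, tmp + pos, "\n");
+ *	}
+ */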
+
+static int __proc_dobitmasks(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ const int tmpstrlen = 512;
+ char *tmpstr;
+ int rc;
+ unsigned int *mask = data;
+ int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0;
+ int is_printk = (mask == &libcfs_printk) ? 1 : 0;
+
+ rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen);
+ if (rc < 0)
+ return rc;
+
+ if (!write) {
+ libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys);
+ rc = strlen(tmpstr);
+
+ if (pos >= rc) {
+ rc = 0;
+ } else {
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, "\n");
+ }
+ } else {
+ rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob);
+ if (rc < 0) {
+ cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+ return rc;
+ }
+
+ rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys);
+ /* Always print LBUG/LASSERT to console, so keep this mask */
+ if (is_printk)
+ *mask |= D_EMERG;
+ }
+
+ cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+ return rc;
+}
+
+static int proc_dobitmasks(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_dobitmasks);
+}
+
+static int min_watchdog_ratelimit; /* disable ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */
+
+static int __proc_dump_kernel(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ if (!write)
+ return 0;
+
+ return cfs_trace_dump_debug_buffer_usrstr(buffer, nob);
+}
+
+static int proc_dump_kernel(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_dump_kernel);
+}
+
+static int __proc_daemon_file(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ if (!write) {
+ int len = strlen(cfs_tracefile);
+
+ if (pos >= len)
+ return 0;
+
+ return cfs_trace_copyout_string(buffer, nob,
+ cfs_tracefile + pos, "\n");
+ }
+
+ return cfs_trace_daemon_command_usrstr(buffer, nob);
+}
+
+static int proc_daemon_file(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_daemon_file);
+}
+
+static int __proc_debug_mb(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ if (!write) {
+ char tmpstr[32];
+ int len = snprintf(tmpstr, sizeof(tmpstr), "%d",
+ cfs_trace_get_debug_mb());
+
+ if (pos >= len)
+ return 0;
+
+ return cfs_trace_copyout_string(buffer, nob, tmpstr + pos,
+ "\n");
+ }
+
+ return cfs_trace_set_debug_mb_usrstr(buffer, nob);
+}
+
+static int proc_debug_mb(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_debug_mb);
+}
+
+static int proc_console_max_delay_cs(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int rc, max_delay_cs;
+ struct ctl_table dummy = *table;
+ long d;
+
+ dummy.data = &max_delay_cs;
+ dummy.proc_handler = &proc_dointvec;
+
+ if (!write) { /* read */
+ max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100);
+ rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+ return rc;
+ }
+
+ /* write */
+ max_delay_cs = 0;
+ rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+ if (rc < 0)
+ return rc;
+ if (max_delay_cs <= 0)
+ return -EINVAL;
+
+ d = cfs_time_seconds(max_delay_cs) / 100;
+ if (d == 0 || d < libcfs_console_min_delay)
+ return -EINVAL;
+ libcfs_console_max_delay = d;
+
+ return rc;
+}
+
+static int proc_console_min_delay_cs(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int rc, min_delay_cs;
+ struct ctl_table dummy = *table;
+ long d;
+
+ dummy.data = &min_delay_cs;
+ dummy.proc_handler = &proc_dointvec;
+
+ if (!write) { /* read */
+ min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100);
+ rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+ return rc;
+ }
+
+ /* write */
+ min_delay_cs = 0;
+ rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+ if (rc < 0)
+ return rc;
+ if (min_delay_cs <= 0)
+ return -EINVAL;
+
+ d = cfs_time_seconds(min_delay_cs) / 100;
+ if (d == 0 || d > libcfs_console_max_delay)
+ return -EINVAL;
+ libcfs_console_min_delay = d;
+
+ return rc;
+}
+
+static int proc_console_backoff(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc, backoff;
+ struct ctl_table dummy = *table;
+
+ dummy.data = &backoff;
+ dummy.proc_handler = &proc_dointvec;
+
+ if (!write) { /* read */
+ backoff = libcfs_console_backoff;
+ rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+ return rc;
+ }
+
+ /* write */
+ backoff = 0;
+ rc = proc_dointvec(&dummy, write, buffer, lenp, ppos);
+ if (rc < 0)
+ return rc;
+ if (backoff <= 0)
+ return -EINVAL;
+
+ libcfs_console_backoff = backoff;
+
+ return rc;
+}
+
+static int libcfs_force_lbug(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ if (write)
+ LBUG();
+ return 0;
+}
+
+static int proc_fail_loc(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int rc;
+ long old_fail_loc = cfs_fail_loc;
+
+ rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (old_fail_loc != cfs_fail_loc)
+ wake_up(&cfs_race_waitq);
+ return rc;
+}
+
+static int __proc_cpt_table(void *data, int write,
+ loff_t pos, void __user *buffer, int nob)
+{
+ char *buf = NULL;
+ int len = 4096;
+ int rc = 0;
+
+ if (write)
+ return -EPERM;
+
+ LASSERT(cfs_cpt_table != NULL);
+
+ while (1) {
+ LIBCFS_ALLOC(buf, len);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ rc = cfs_cpt_table_print(cfs_cpt_table, buf, len);
+ if (rc >= 0)
+ break;
+
+ if (rc == -EFBIG) {
+ LIBCFS_FREE(buf, len);
+ len <<= 1;
+ continue;
+ }
+ goto out;
+ }
+
+ if (pos >= rc) {
+ rc = 0;
+ goto out;
+ }
+
+ rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+ if (buf != NULL)
+ LIBCFS_FREE(buf, len);
+ return rc;
+}
+
+static int proc_cpt_table(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_call_handler(table->data, write, ppos, buffer, lenp,
+ __proc_cpt_table);
+}
+
+static struct ctl_table lnet_table[] = {
+ /*
+ * NB No .strategy entries have been provided since sysctl(8) prefers
+ * to go via /proc for portability.
+ */
+ {
+ .procname = "debug",
+ .data = &libcfs_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dobitmasks,
+ },
+ {
+ .procname = "subsystem_debug",
+ .data = &libcfs_subsystem_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dobitmasks,
+ },
+ {
+ .procname = "printk",
+ .data = &libcfs_printk,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dobitmasks,
+ },
+ {
+ .procname = "console_ratelimit",
+ .data = &libcfs_console_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .procname = "console_max_delay_centisecs",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_console_max_delay_cs
+ },
+ {
+ .procname = "console_min_delay_centisecs",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_console_min_delay_cs
+ },
+ {
+ .procname = "console_backoff",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_console_backoff
+ },
+
+ {
+ .procname = "debug_path",
+ .data = libcfs_debug_file_path_arr,
+ .maxlen = sizeof(libcfs_debug_file_path_arr),
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ },
+
+ {
+ .procname = "cpu_partition_table",
+ .maxlen = 128,
+ .mode = 0444,
+ .proc_handler = &proc_cpt_table,
+ },
+
+ {
+ .procname = "upcall",
+ .data = lnet_upcall,
+ .maxlen = sizeof(lnet_upcall),
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "debug_log_upcall",
+ .data = lnet_debug_log_upcall,
+ .maxlen = sizeof(lnet_debug_log_upcall),
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "lnet_memused",
+ .data = (int *)&libcfs_kmemory.counter,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "catastrophe",
+ .data = &libcfs_catastrophe,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "panic_on_lbug",
+ .data = &libcfs_panic_on_lbug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "dump_kernel",
+ .maxlen = 256,
+ .mode = 0200,
+ .proc_handler = &proc_dump_kernel,
+ },
+ {
+ .procname = "daemon_file",
+ .mode = 0644,
+ .maxlen = 256,
+ .proc_handler = &proc_daemon_file,
+ },
+ {
+ .procname = "debug_mb",
+ .mode = 0644,
+ .proc_handler = &proc_debug_mb,
+ },
+ {
+ .procname = "watchdog_ratelimit",
+ .data = &libcfs_watchdog_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &min_watchdog_ratelimit,
+ .extra2 = &max_watchdog_ratelimit,
+ },
+ {
+ .procname = "force_lbug",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0200,
+ .proc_handler = &libcfs_force_lbug
+ },
+ {
+ .procname = "fail_loc",
+ .data = &cfs_fail_loc,
+ .maxlen = sizeof(cfs_fail_loc),
+ .mode = 0644,
+ .proc_handler = &proc_fail_loc
+ },
+ {
+ .procname = "fail_val",
+ .data = &cfs_fail_val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ }
+};
+
+static struct ctl_table top_table[] = {
+ {
+ .procname = "lnet",
+ .mode = 0555,
+ .data = NULL,
+ .maxlen = 0,
+ .child = lnet_table,
+ },
+ {
+ }
+};
+
+static int insert_proc(void)
+{
+ if (lnet_table_header == NULL)
+ lnet_table_header = register_sysctl_table(top_table);
+ return 0;
+}
+
+static void remove_proc(void)
+{
+ if (lnet_table_header != NULL)
+ unregister_sysctl_table(lnet_table_header);
+
+ lnet_table_header = NULL;
+}
+
+MODULE_VERSION("1.0.0");
+
+module_init(init_libcfs_module);
+module_exit(exit_libcfs_module);
diff --git a/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
new file mode 100644
index 000000000..087449f4e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
@@ -0,0 +1,842 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../../include/linux/lnet/lnet.h"
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions. Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racy temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users. If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int libcfs_nidstring_idx;
+
+static spinlock_t libcfs_nidstring_lock;
+
+void libcfs_init_nidstrings(void)
+{
+ spin_lock_init(&libcfs_nidstring_lock);
+}
+
+static char *
+libcfs_next_nidstring(void)
+{
+ char *str;
+ unsigned long flags;
+
+ spin_lock_irqsave(&libcfs_nidstring_lock, flags);
+
+ str = libcfs_nidstrings[libcfs_nidstring_idx++];
+ if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings))
+ libcfs_nidstring_idx = 0;
+
+ spin_unlock_irqrestore(&libcfs_nidstring_lock, flags);
+ return str;
+}
+
+static int libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+ *addr = 0;
+ return 1;
+}
+
+static void libcfs_ip_addr2str(__u32 addr, char *str)
+{
+ snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+ (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+ (addr >> 8) & 0xff, addr & 0xff);
+}
+
+static int libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+ unsigned int a;
+ unsigned int b;
+ unsigned int c;
+ unsigned int d;
+ int n = nob; /* XscanfX */
+
+ /* numeric IP? */
+ if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+ n == nob &&
+ (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+ (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+ *addr = ((a<<24)|(b<<16)|(c<<8)|d);
+ return 1;
+ }
+
+ return 0;
+}
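+
+/*
+ * Parse examples (illustrative): with str == "10.0.0.1" and nob == 8 the
+ * sscanf() above consumes the whole string, n ends up equal to nob and the
+ * address 0x0a000001 is accepted; "10.0.0.1junk" is rejected because
+ * n != nob, and "10.0.0.300" fails the (d & ~0xff) check.
+ */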
+
+static void libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+ snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);
+}
+
+static void libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+ snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);
+}
+
+static int libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+ int n;
+
+ n = nob;
+ if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+ return 1;
+
+ n = nob;
+ if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+ return 1;
+
+ n = nob;
+ if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * Nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
+ *
+ * \retval 0 if \a str parsed to numeric address
+ * \retval errno otherwise
+ */
+static int
+libcfs_num_parse(char *str, int len, struct list_head *list)
+{
+ struct cfs_expr_list *el;
+ int rc;
+
+ rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el);
+ if (rc == 0)
+ list_add_tail(&el->el_link, list);
+
+ return rc;
+}
+
+/*
+ * Nf_match_addr method for networks using numeric addresses
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+static int
+libcfs_num_match(__u32 addr, struct list_head *numaddr)
+{
+ struct cfs_expr_list *el;
+
+ LASSERT(!list_empty(numaddr));
+ el = list_entry(numaddr->next, struct cfs_expr_list, el_link);
+
+ return cfs_expr_list_match(addr, el);
+}
+
+struct netstrfns {
+ int nf_type;
+ char *nf_name;
+ char *nf_modname;
+ void (*nf_addr2str)(__u32 addr, char *str);
+ int (*nf_str2addr)(const char *str, int nob, __u32 *addr);
+ int (*nf_parse_addrlist)(char *str, int len,
+ struct list_head *list);
+ int (*nf_match_addr)(__u32 addr, struct list_head *list);
+};
+
+static struct netstrfns libcfs_netstrfns[] = {
+ {/* .nf_type */ LOLND,
+ /* .nf_name */ "lo",
+ /* .nf_modname */ "klolnd",
+ /* .nf_addr2str */ libcfs_decnum_addr2str,
+ /* .nf_str2addr */ libcfs_lo_str2addr,
+ /* .nf_parse_addr*/ libcfs_num_parse,
+ /* .nf_match_addr*/ libcfs_num_match},
+ {/* .nf_type */ SOCKLND,
+ /* .nf_name */ "tcp",
+ /* .nf_modname */ "ksocklnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ O2IBLND,
+ /* .nf_name */ "o2ib",
+ /* .nf_modname */ "ko2iblnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ CIBLND,
+ /* .nf_name */ "cib",
+ /* .nf_modname */ "kciblnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ OPENIBLND,
+ /* .nf_name */ "openib",
+ /* .nf_modname */ "kopeniblnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ IIBLND,
+ /* .nf_name */ "iib",
+ /* .nf_modname */ "kiiblnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ VIBLND,
+ /* .nf_name */ "vib",
+ /* .nf_modname */ "kviblnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ RALND,
+ /* .nf_name */ "ra",
+ /* .nf_modname */ "kralnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ QSWLND,
+ /* .nf_name */ "elan",
+ /* .nf_modname */ "kqswlnd",
+ /* .nf_addr2str */ libcfs_decnum_addr2str,
+ /* .nf_str2addr */ libcfs_num_str2addr,
+ /* .nf_parse_addrlist*/ libcfs_num_parse,
+ /* .nf_match_addr*/ libcfs_num_match},
+ {/* .nf_type */ GMLND,
+ /* .nf_name */ "gm",
+ /* .nf_modname */ "kgmlnd",
+ /* .nf_addr2str */ libcfs_hexnum_addr2str,
+ /* .nf_str2addr */ libcfs_num_str2addr,
+ /* .nf_parse_addrlist*/ libcfs_num_parse,
+ /* .nf_match_addr*/ libcfs_num_match},
+ {/* .nf_type */ MXLND,
+ /* .nf_name */ "mx",
+ /* .nf_modname */ "kmxlnd",
+ /* .nf_addr2str */ libcfs_ip_addr2str,
+ /* .nf_str2addr */ libcfs_ip_str2addr,
+ /* .nf_parse_addrlist*/ cfs_ip_addr_parse,
+ /* .nf_match_addr*/ cfs_ip_addr_match},
+ {/* .nf_type */ PTLLND,
+ /* .nf_name */ "ptl",
+ /* .nf_modname */ "kptllnd",
+ /* .nf_addr2str */ libcfs_decnum_addr2str,
+ /* .nf_str2addr */ libcfs_num_str2addr,
+ /* .nf_parse_addrlist*/ libcfs_num_parse,
+ /* .nf_match_addr*/ libcfs_num_match},
+ {/* .nf_type */ GNILND,
+ /* .nf_name */ "gni",
+ /* .nf_modname */ "kgnilnd",
+ /* .nf_addr2str */ libcfs_decnum_addr2str,
+ /* .nf_str2addr */ libcfs_num_str2addr,
+ /* .nf_parse_addrlist*/ libcfs_num_parse,
+ /* .nf_match_addr*/ libcfs_num_match},
+ /* placeholder for net0 alias. It MUST BE THE LAST ENTRY */
+ {/* .nf_type */ -1},
+};
+
+static const int libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns);
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of an sscanf format to detect trailing junk. However,
+ * sscanf may return immediately if it sees the terminating '\0' in a string,
+ * so I initialise the %n variable to the expected length. If sscanf sets it,
+ * fine; if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
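
To make the idiom concrete, here is a minimal user-space sketch of the same trick (illustrative only, not part of this patch): %n records how many characters sscanf consumed, and pre-setting it to the expected length means a scan that stops exactly at the terminating NUL still passes the trailing-junk check.

#include <stdio.h>

/* Returns 1 if 'str' (of length 'nob') is a plain decimal number with no
 * trailing junk, mirroring the libcfs_num_str2addr() idiom above. */
static int is_plain_decimal(const char *str, int nob)
{
	unsigned int val;
	int n = nob;	/* pre-set to the expected length, per the caveat */

	return sscanf(str, "%u%n", &val, &n) >= 1 && n == nob;
}

int main(void)
{
	printf("%d %d\n", is_plain_decimal("123", 3),	/* 1 */
	       is_plain_decimal("123x", 4));		/* 0: trailing junk */
	return 0;
}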
+
+static struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+ int i;
+
+ if (lnd >= 0)
+ for (i = 0; i < libcfs_nnetstrfns; i++)
+ if (lnd == libcfs_netstrfns[i].nf_type)
+ return &libcfs_netstrfns[i];
+
+ return NULL;
+}
+
+static struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+ struct netstrfns *nf;
+ int i;
+
+ for (i = 0; i < libcfs_nnetstrfns; i++) {
+ nf = &libcfs_netstrfns[i];
+ if (nf->nf_type >= 0 &&
+ !strncmp(name, nf->nf_name, strlen(nf->nf_name)))
+ return nf;
+ }
+ return NULL;
+}
+
+static struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+ int i;
+
+ for (i = 0; i < libcfs_nnetstrfns; i++)
+ if (libcfs_netstrfns[i].nf_type >= 0 &&
+ !strcmp(libcfs_netstrfns[i].nf_name, name))
+ return &libcfs_netstrfns[i];
+
+ return NULL;
+}
+
+int
+libcfs_isknown_lnd(int type)
+{
+ return libcfs_lnd2netstrfns(type) != NULL;
+}
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+
+char *
+libcfs_lnd2modname(int lnd)
+{
+ struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+ return (nf == NULL) ? NULL : nf->nf_modname;
+}
+EXPORT_SYMBOL(libcfs_lnd2modname);
+
+char *
+libcfs_lnd2str(int lnd)
+{
+ char *str;
+ struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+ if (nf != NULL)
+ return nf->nf_name;
+
+ str = libcfs_next_nidstring();
+ snprintf(str, LNET_NIDSTR_SIZE, "?%d?", lnd);
+ return str;
+}
+EXPORT_SYMBOL(libcfs_lnd2str);
+
+int
+libcfs_str2lnd(const char *str)
+{
+ struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+ if (nf != NULL)
+ return nf->nf_type;
+
+ return -1;
+}
+EXPORT_SYMBOL(libcfs_str2lnd);
+
+char *
+libcfs_net2str(__u32 net)
+{
+ int lnd = LNET_NETTYP(net);
+ int num = LNET_NETNUM(net);
+ struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+ char *str = libcfs_next_nidstring();
+
+ if (nf == NULL)
+ snprintf(str, LNET_NIDSTR_SIZE, "<%d:%d>", lnd, num);
+ else if (num == 0)
+ snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+ else
+ snprintf(str, LNET_NIDSTR_SIZE, "%s%d", nf->nf_name, num);
+
+ return str;
+}
+EXPORT_SYMBOL(libcfs_net2str);
+
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+ __u32 addr = LNET_NIDADDR(nid);
+ __u32 net = LNET_NIDNET(nid);
+ int lnd = LNET_NETTYP(net);
+ int nnum = LNET_NETNUM(net);
+ struct netstrfns *nf;
+ char *str;
+ int nob;
+
+ if (nid == LNET_NID_ANY)
+ return "<?>";
+
+ nf = libcfs_lnd2netstrfns(lnd);
+ str = libcfs_next_nidstring();
+
+ if (nf == NULL)
+ snprintf(str, LNET_NIDSTR_SIZE, "%x@<%d:%d>", addr, lnd, nnum);
+ else {
+ nf->nf_addr2str(addr, str);
+ nob = strlen(str);
+ if (nnum == 0)
+ snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s",
+ nf->nf_name);
+ else
+ snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%d",
+ nf->nf_name, nnum);
+ }
+
+ return str;
+}
+EXPORT_SYMBOL(libcfs_nid2str);
+
+static struct netstrfns *
+libcfs_str2net_internal(const char *str, __u32 *net)
+{
+ struct netstrfns *uninitialized_var(nf);
+ int nob;
+ unsigned int netnum;
+ int i;
+
+ for (i = 0; i < libcfs_nnetstrfns; i++) {
+ nf = &libcfs_netstrfns[i];
+ if (nf->nf_type >= 0 &&
+ !strncmp(str, nf->nf_name, strlen(nf->nf_name)))
+ break;
+ }
+
+ if (i == libcfs_nnetstrfns)
+ return NULL;
+
+ nob = strlen(nf->nf_name);
+
+ if (strlen(str) == (unsigned int)nob) {
+ netnum = 0;
+ } else {
+ if (nf->nf_type == LOLND) /* net number not allowed */
+ return NULL;
+
+ str += nob;
+ i = strlen(str);
+ if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
+ i != (int)strlen(str))
+ return NULL;
+ }
+
+ *net = LNET_MKNET(nf->nf_type, netnum);
+ return nf;
+}
+
+__u32
+libcfs_str2net(const char *str)
+{
+ __u32 net;
+
+ if (libcfs_str2net_internal(str, &net) != NULL)
+ return net;
+
+ return LNET_NIDNET(LNET_NID_ANY);
+}
+EXPORT_SYMBOL(libcfs_str2net);
+
+lnet_nid_t
+libcfs_str2nid(const char *str)
+{
+ const char *sep = strchr(str, '@');
+ struct netstrfns *nf;
+ __u32 net;
+ __u32 addr;
+
+ if (sep != NULL) {
+ nf = libcfs_str2net_internal(sep + 1, &net);
+ if (nf == NULL)
+ return LNET_NID_ANY;
+ } else {
+ sep = str + strlen(str);
+ net = LNET_MKNET(SOCKLND, 0);
+ nf = libcfs_lnd2netstrfns(SOCKLND);
+ LASSERT(nf != NULL);
+ }
+
+ if (!nf->nf_str2addr(str, (int)(sep - str), &addr))
+ return LNET_NID_ANY;
+
+ return LNET_MKNID(net, addr);
+}
+EXPORT_SYMBOL(libcfs_str2nid);
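
As a quick illustration of how the string helpers above compose, a sketch (not code from this patch; the NID string is an arbitrary example):

/* Sketch only: parse a NID string and print it back along with its net. */
static void example_nid_roundtrip(void)
{
	lnet_nid_t nid = libcfs_str2nid("192.168.1.2@tcp");

	if (nid != LNET_NID_ANY)
		CDEBUG(D_NET, "parsed %s on net %s\n",
		       libcfs_nid2str(nid),
		       libcfs_net2str(LNET_NIDNET(nid)));
}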
+
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+ char *str = libcfs_next_nidstring();
+
+ if (id.pid == LNET_PID_ANY) {
+ snprintf(str, LNET_NIDSTR_SIZE,
+ "LNET_PID_ANY-%s", libcfs_nid2str(id.nid));
+ return str;
+ }
+
+ snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+ ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+ (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid));
+ return str;
+}
+EXPORT_SYMBOL(libcfs_id2str);
+
+int
+libcfs_str2anynid(lnet_nid_t *nidp, const char *str)
+{
+ if (!strcmp(str, "*")) {
+ *nidp = LNET_NID_ANY;
+ return 1;
+ }
+
+ *nidp = libcfs_str2nid(str);
+ return *nidp != LNET_NID_ANY;
+}
+EXPORT_SYMBOL(libcfs_str2anynid);
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist> :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange> :== <addrrange> '@' <net>
+ * <addrrange> :== '*' |
+ * <ipaddr_range> |
+ * <cfs_expr_list>
+ * <ipaddr_range> :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ * <cfs_expr_list>
+ * <cfs_expr_list> :== <number> |
+ * <expr_list>
+ * <expr_list> :== '[' <range_expr> [ ',' <range_expr>] ']'
+ * <range_expr> :== <number> |
+ * <number> '-' <number> |
+ * <number> '-' <number> '/' <number>
+ * <net> :== <netname> | <netname><number>
+ * <netname> :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ * "vib" | "ra" | "elan" | "mx" | "ptl"
+ * \endverbatim
+ */
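
For reference, a few strings accepted by the grammar above (illustrative examples, not taken from the tree):

/* Illustrative nidlists that follow the BNF above (examples only). */
static const char *example_nidlists[] = {
	"*@tcp",			/* every NID on the tcp net */
	"192.168.1.[2-10/2]@tcp0",	/* hosts 2,4,...,10 on tcp0 */
	"[1-100]@elan5 10.0.0.1@o2ib",	/* two space-separated nidranges */
};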
+
+/**
+ * Structure to represent \<nidrange\> token of the syntax.
+ *
+ * One of these is created for each \<net\> parsed.
+ */
+struct nidrange {
+ /**
+ * Link to list of this structures which is built on nid range
+ * list parsing.
+ */
+ struct list_head nr_link;
+ /**
+ * List head for addrrange::ar_link.
+ */
+ struct list_head nr_addrranges;
+ /**
+ * Flag indicating that *@<net> is found.
+ */
+ int nr_all;
+ /**
+ * Pointer to corresponding element of libcfs_netstrfns.
+ */
+ struct netstrfns *nr_netstrfns;
+ /**
+ * Network number, e.g. 5 if \<net\> is "elan5".
+ */
+ int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange\> token of the syntax.
+ */
+struct addrrange {
+ /**
+ * Link to nidrange::nr_addrranges.
+ */
+ struct list_head ar_link;
+ /**
+ * List head for cfs_expr_list::el_list.
+ */
+ struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * Parses the \<addrrange\> token of the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * \retval 1 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
+ * \retval 0 otherwise
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+ struct addrrange *addrrange;
+
+ if (src->ls_len == 1 && src->ls_str[0] == '*') {
+ nidrange->nr_all = 1;
+ return 1;
+ }
+
+ LIBCFS_ALLOC(addrrange, sizeof(struct addrrange));
+ if (addrrange == NULL)
+ return 0;
+ list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+ INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+ return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+ src->ls_len,
+ &addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for corresponding
+ * nidrange on the list of nidranges (\a nidlist), and creates a new struct
+ * nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+ struct list_head *nidlist)
+{
+ struct netstrfns *nf;
+ struct nidrange *nr;
+ int endlen;
+ unsigned netnum;
+
+ if (src->ls_len >= LNET_NIDSTR_SIZE)
+ return NULL;
+
+ nf = libcfs_namenum2netstrfns(src->ls_str);
+ if (nf == NULL)
+ return NULL;
+ endlen = src->ls_len - strlen(nf->nf_name);
+ if (endlen == 0)
+ /* network name only, e.g. "elan" or "tcp" */
+ netnum = 0;
+ else {
+ /* e.g. "elan25" or "tcp23", refuse to parse if
+ * network name is not appended with decimal or
+ * hexadecimal number */
+ if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name),
+ endlen, &netnum, 0, MAX_NUMERIC_VALUE))
+ return NULL;
+ }
+
+ list_for_each_entry(nr, nidlist, nr_link) {
+ if (nr->nr_netstrfns != nf)
+ continue;
+ if (nr->nr_netnum != netnum)
+ continue;
+ return nr;
+ }
+
+ LIBCFS_ALLOC(nr, sizeof(struct nidrange));
+ if (nr == NULL)
+ return NULL;
+ list_add_tail(&nr->nr_link, nidlist);
+ INIT_LIST_HEAD(&nr->nr_addrranges);
+ nr->nr_netstrfns = nf;
+ nr->nr_all = 0;
+ nr->nr_netnum = netnum;
+
+ return nr;
+}
+
+/**
+ * Parses \<nidrange\> token of the syntax.
+ *
+ * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+ struct cfs_lstr addrrange;
+ struct cfs_lstr net;
+ struct cfs_lstr tmp;
+ struct nidrange *nr;
+
+ tmp = *src;
+ if (cfs_gettok(src, '@', &addrrange) == 0)
+ goto failed;
+
+ if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL)
+ goto failed;
+
+ nr = add_nidrange(&net, nidlist);
+ if (nr == NULL)
+ goto failed;
+
+ if (parse_addrange(&addrrange, nr) != 0)
+ goto failed;
+
+ return 1;
+ failed:
+ CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str);
+ return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees
+ * cfs_expr_list list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct addrrange *ar;
+
+ ar = list_entry(list->next, struct addrrange, ar_link);
+
+ cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+ list_del(&ar->ar_link);
+ LIBCFS_FREE(ar, sizeof(struct addrrange));
+ }
+}
+
+/**
+ * Frees nidrange structures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+ struct list_head *pos, *next;
+ struct nidrange *nr;
+
+ list_for_each_safe(pos, next, list) {
+ nr = list_entry(pos, struct nidrange, nr_link);
+ free_addrranges(&nr->nr_addrranges);
+ list_del(pos);
+ LIBCFS_FREE(nr, sizeof(struct nidrange));
+ }
+}
+EXPORT_SYMBOL(cfs_free_nidlist);
+
+/**
+ * Parses nid range list.
+ *
+ * Parses \a str, with rigorous syntax and overflow checking, into
+ * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into a set of
+ * structures and links those structures to \a nidlist. The resulting
+ * list can be used to match a NID against the set of NIDs defined by
+ * \a str.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+ struct cfs_lstr src;
+ struct cfs_lstr res;
+ int rc;
+
+ src.ls_str = str;
+ src.ls_len = len;
+ INIT_LIST_HEAD(nidlist);
+ while (src.ls_str) {
+ rc = cfs_gettok(&src, ' ', &res);
+ if (rc == 0) {
+ cfs_free_nidlist(nidlist);
+ return 0;
+ }
+ rc = parse_nidrange(&res, nidlist);
+ if (rc == 0) {
+ cfs_free_nidlist(nidlist);
+ return 0;
+ }
+ }
+ return 1;
+}
+EXPORT_SYMBOL(cfs_parse_nidlist);
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+ struct nidrange *nr;
+ struct addrrange *ar;
+
+ list_for_each_entry(nr, nidlist, nr_link) {
+ if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)))
+ continue;
+ if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
+ continue;
+ if (nr->nr_all)
+ return 1;
+ list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
+ if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+ &ar->ar_numaddr_ranges))
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cfs_match_nid);
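
A hedged sketch of how a caller could tie the three exported helpers above together; the nidlist string and the helper name are hypothetical:

/* Sketch only: parse an example nidlist, test a NID against it, and free
 * the compiled list.  The spec string is illustrative. */
static int nid_is_allowed(lnet_nid_t nid)
{
	char spec[] = "192.168.1.[1-100]@tcp *@o2ib";
	struct list_head nidlist;
	int matched = 0;

	if (cfs_parse_nidlist(spec, strlen(spec), &nidlist)) {
		matched = cfs_match_nid(nid, &nidlist);
		cfs_free_nidlist(&nidlist);
	}
	return matched;
}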
diff --git a/drivers/staging/lustre/lustre/libcfs/prng.c b/drivers/staging/lustre/lustre/libcfs/prng.c
new file mode 100644
index 000000000..4147664ff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/prng.c
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/prng.c
+ *
+ * Concatenation of the following two 16-bit multiply-with-carry generators:
+ * x(n) = a*x(n-1) + carry mod 2^16 and y(n) = b*y(n-1) + carry mod 2^16,
+ * with the number and carry packed within the same 32-bit integer.
+ * Algorithm recommended by Marsaglia.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+
+/**
+ * cfs_rand - generate a pseudo-random 32-bit integer
+ *
+ * First it advances both seeds from their previous values, then it combines
+ * them into a new pseudo-random value for use.
+ *
+ * Returns a pseudo-random 32-bit integer
+ */
+unsigned int cfs_rand(void)
+{
+ seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16);
+ seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16);
+
+ return ((seed_x << 16) + (seed_y & 65535));
+}
+EXPORT_SYMBOL(cfs_rand);
+
+/**
+ * cfs_srand - sets the initial seed
+ * @seed1 : (seed_x) should have the most entropy in the low bits of the word
+ * @seed2 : (seed_y) should have the most entropy in the high bits of the word
+ *
+ * Replaces the original seeds with new values; a zero argument leaves the
+ * corresponding seed unchanged. Used to reseed the pseudo-random generator.
+ */
+void cfs_srand(unsigned int seed1, unsigned int seed2)
+{
+ if (seed1)
+ seed_x = seed1; /* use default seeds if parameter is 0 */
+ if (seed2)
+ seed_y = seed2;
+}
+EXPORT_SYMBOL(cfs_srand);
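
A minimal sketch of reseeding and drawing a value (assumes only the two exported helpers above plus the kernel's get_random_bytes()):

/* Sketch only: reseed the MWC generator from stronger entropy, then draw. */
static unsigned int example_draw(void)
{
	unsigned int s1, s2;

	get_random_bytes(&s1, sizeof(s1));
	get_random_bytes(&s2, sizeof(s2));
	cfs_srand(s1, s2);	/* a zero seed would keep the previous value */

	return cfs_rand();	/* pseudo-random 32-bit value */
}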
+
+/**
+ * cfs_get_random_bytes - generate a bunch of random numbers
+ * @buf : buffer to fill with random numbers
+ * @size: size of passed in buffer
+ *
+ * Fills a buffer with random bytes
+ */
+void cfs_get_random_bytes(void *buf, int size)
+{
+ int *p = buf;
+ int rem, tmp;
+
+ LASSERT(size >= 0);
+
+ rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size);
+ if (rem) {
+ get_random_bytes(&tmp, sizeof(tmp));
+ tmp ^= cfs_rand();
+ memcpy(buf, &tmp, rem);
+ p = buf + rem;
+ size -= rem;
+ }
+
+ while (size >= sizeof(int)) {
+ get_random_bytes(&tmp, sizeof(tmp));
+ *p = cfs_rand() ^ tmp;
+ size -= sizeof(int);
+ p++;
+ }
+ buf = p;
+ if (size) {
+ get_random_bytes(&tmp, sizeof(tmp));
+ tmp ^= cfs_rand();
+ memcpy(buf, &tmp, size);
+ }
+}
+EXPORT_SYMBOL(cfs_get_random_bytes);
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c
new file mode 100644
index 000000000..c86394f7f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/tracefile.c
@@ -0,0 +1,1196 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown <zab@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+#include "tracefile.h"
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+/* XXX move things up to the top, comment */
+union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;
+
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+static struct tracefiled_ctl trace_tctl;
+struct mutex cfs_trace_thread_mutex;
+static int thread_running;
+
+static atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+ struct cfs_trace_cpu_data *tcd);
+
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+ return list_entry(list, struct cfs_trace_page, linkage);
+}
+
+static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp)
+{
+ struct page *page;
+ struct cfs_trace_page *tage;
+
+ /* My caller is trying to free memory */
+ if (!in_interrupt() && memory_pressure_get())
+ return NULL;
+
+ /*
+ * Don't spam console with allocation failures: they will be reported
+ * by upper layer anyway.
+ */
+ gfp |= __GFP_NOWARN;
+ page = alloc_page(gfp);
+ if (page == NULL)
+ return NULL;
+
+ tage = kmalloc(sizeof(*tage), gfp);
+ if (tage == NULL) {
+ __free_page(page);
+ return NULL;
+ }
+
+ tage->page = page;
+ atomic_inc(&cfs_tage_allocated);
+ return tage;
+}
+
+static void cfs_tage_free(struct cfs_trace_page *tage)
+{
+ __LASSERT(tage != NULL);
+ __LASSERT(tage->page != NULL);
+
+ __free_page(tage->page);
+ kfree(tage);
+ atomic_dec(&cfs_tage_allocated);
+}
+
+static void cfs_tage_to_tail(struct cfs_trace_page *tage,
+ struct list_head *queue)
+{
+ __LASSERT(tage != NULL);
+ __LASSERT(queue != NULL);
+
+ list_move_tail(&tage->linkage, queue);
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp,
+ struct list_head *stock)
+{
+ int i;
+
+ /*
+ * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+ * from here: this will lead to infinite recursion.
+ */
+
+ for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES; ++i) {
+ struct cfs_trace_page *tage;
+
+ tage = cfs_tage_alloc(gfp);
+ if (tage == NULL)
+ break;
+ list_add_tail(&tage->linkage, stock);
+ }
+ return i;
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *
+cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
+{
+ struct cfs_trace_page *tage;
+
+ if (tcd->tcd_cur_pages > 0) {
+ __LASSERT(!list_empty(&tcd->tcd_pages));
+ tage = cfs_tage_from_list(tcd->tcd_pages.prev);
+ if (tage->used + len <= PAGE_CACHE_SIZE)
+ return tage;
+ }
+
+ if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
+ if (tcd->tcd_cur_stock_pages > 0) {
+ tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
+ --tcd->tcd_cur_stock_pages;
+ list_del_init(&tage->linkage);
+ } else {
+ tage = cfs_tage_alloc(GFP_ATOMIC);
+ if (unlikely(tage == NULL)) {
+ if ((!memory_pressure_get() ||
+ in_interrupt()) && printk_ratelimit())
+ printk(KERN_WARNING
+ "cannot allocate a tage (%ld)\n",
+ tcd->tcd_cur_pages);
+ return NULL;
+ }
+ }
+
+ tage->used = 0;
+ tage->cpu = smp_processor_id();
+ tage->type = tcd->tcd_type;
+ list_add_tail(&tage->linkage, &tcd->tcd_pages);
+ tcd->tcd_cur_pages++;
+
+ if (tcd->tcd_cur_pages > 8 && thread_running) {
+ struct tracefiled_ctl *tctl = &trace_tctl;
+ /*
+ * wake up tracefiled to process some pages.
+ */
+ wake_up(&tctl->tctl_waitq);
+ }
+ return tage;
+ }
+ return NULL;
+}
+
+static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
+{
+ int pgcount = tcd->tcd_cur_pages / 10;
+ struct page_collection pc;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+
+ /*
+ * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+ * from here: this will lead to infinite recursion.
+ */
+
+ if (printk_ratelimit())
+ printk(KERN_WARNING "debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n",
+ pgcount + 1, tcd->tcd_cur_pages);
+
+ INIT_LIST_HEAD(&pc.pc_pages);
+ spin_lock_init(&pc.pc_lock);
+
+ list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
+ if (pgcount-- == 0)
+ break;
+
+ list_move_tail(&tage->linkage, &pc.pc_pages);
+ tcd->tcd_cur_pages--;
+ }
+ put_pages_on_tcd_daemon_list(&pc, tcd);
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
+ unsigned long len)
+{
+ struct cfs_trace_page *tage;
+
+ /*
+ * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+ * from here: this will lead to infinite recursion.
+ */
+
+ if (len > PAGE_CACHE_SIZE) {
+ pr_err("cowardly refusing to write %lu bytes in a page\n", len);
+ return NULL;
+ }
+
+ tage = cfs_trace_get_tage_try(tcd, len);
+ if (tage != NULL)
+ return tage;
+ if (thread_running)
+ cfs_tcd_shrink(tcd);
+ if (tcd->tcd_cur_pages > 0) {
+ tage = cfs_tage_from_list(tcd->tcd_pages.next);
+ tage->used = 0;
+ cfs_tage_to_tail(tage, &tcd->tcd_pages);
+ }
+ return tage;
+}
+
+int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+ const char *format, ...)
+{
+ va_list args;
+ int rc;
+
+ va_start(args, format);
+ rc = libcfs_debug_vmsg2(msgdata, format, args, NULL);
+ va_end(args);
+
+ return rc;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+ const char *format1, va_list args,
+ const char *format2, ...)
+{
+ struct cfs_trace_cpu_data *tcd = NULL;
+ struct ptldebug_header header = {0};
+ struct cfs_trace_page *tage;
+ /* string_buf is used only if tcd != NULL, and is always set then */
+ char *string_buf = NULL;
+ char *debug_buf;
+ int known_size;
+ int needed = 85; /* average message length */
+ int max_nob;
+ va_list ap;
+ int depth;
+ int i;
+ int remain;
+ int mask = msgdata->msg_mask;
+ const char *file = kbasename(msgdata->msg_file);
+ struct cfs_debug_limit_state *cdls = msgdata->msg_cdls;
+
+ tcd = cfs_trace_get_tcd();
+
+ /* cfs_trace_get_tcd() grabs a lock, which disables preemption and
+ * pins us to a particular CPU. This avoids an smp_processor_id()
+ * warning on Linux when debugging is enabled. */
+ cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+
+ if (tcd == NULL) /* arch may not log in IRQ context */
+ goto console;
+
+ if (tcd->tcd_cur_pages == 0)
+ header.ph_flags |= PH_FLAG_FIRST_RECORD;
+
+ if (tcd->tcd_shutting_down) {
+ cfs_trace_put_tcd(tcd);
+ tcd = NULL;
+ goto console;
+ }
+
+ depth = __current_nesting_level();
+ known_size = strlen(file) + 1 + depth;
+ if (msgdata->msg_fn)
+ known_size += strlen(msgdata->msg_fn) + 1;
+
+ if (libcfs_debug_binary)
+ known_size += sizeof(header);
+
+ /*
+ * Loop at most twice ('2'): vsnprintf() returns the real size required
+ * for the output _without_ the terminating NUL, so if 'needed' was too
+ * small for this format on the first pass, retry once with the size it
+ * reported.
+ */
+ for (i = 0; i < 2; i++) {
+ tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
+ if (tage == NULL) {
+ if (needed + known_size > PAGE_CACHE_SIZE)
+ mask |= D_ERROR;
+
+ cfs_trace_put_tcd(tcd);
+ tcd = NULL;
+ goto console;
+ }
+
+ string_buf = (char *)page_address(tage->page) +
+ tage->used + known_size;
+
+ max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+ if (max_nob <= 0) {
+ printk(KERN_EMERG "negative max_nob: %d\n",
+ max_nob);
+ mask |= D_ERROR;
+ cfs_trace_put_tcd(tcd);
+ tcd = NULL;
+ goto console;
+ }
+
+ needed = 0;
+ if (format1) {
+ va_copy(ap, args);
+ needed = vsnprintf(string_buf, max_nob, format1, ap);
+ va_end(ap);
+ }
+
+ if (format2) {
+ remain = max_nob - needed;
+ if (remain < 0)
+ remain = 0;
+
+ va_start(ap, format2);
+ needed += vsnprintf(string_buf + needed, remain,
+ format2, ap);
+ va_end(ap);
+ }
+
+ if (needed < max_nob) /* well. printing ok.. */
+ break;
+ }
+
+ if (*(string_buf+needed-1) != '\n')
+ printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
+ file, msgdata->msg_line, msgdata->msg_fn);
+
+ header.ph_len = known_size + needed;
+ debug_buf = (char *)page_address(tage->page) + tage->used;
+
+ if (libcfs_debug_binary) {
+ memcpy(debug_buf, &header, sizeof(header));
+ tage->used += sizeof(header);
+ debug_buf += sizeof(header);
+ }
+
+ /* indent message according to the nesting level */
+ while (depth-- > 0) {
+ *(debug_buf++) = '.';
+ ++ tage->used;
+ }
+
+ strcpy(debug_buf, file);
+ tage->used += strlen(file) + 1;
+ debug_buf += strlen(file) + 1;
+
+ if (msgdata->msg_fn) {
+ strcpy(debug_buf, msgdata->msg_fn);
+ tage->used += strlen(msgdata->msg_fn) + 1;
+ debug_buf += strlen(msgdata->msg_fn) + 1;
+ }
+
+ __LASSERT(debug_buf == string_buf);
+
+ tage->used += needed;
+ __LASSERT (tage->used <= PAGE_CACHE_SIZE);
+
+console:
+ if ((mask & libcfs_printk) == 0) {
+ /* no console output requested */
+ if (tcd != NULL)
+ cfs_trace_put_tcd(tcd);
+ return 1;
+ }
+
+ if (cdls != NULL) {
+ if (libcfs_console_ratelimit &&
+ cdls->cdls_next != 0 && /* not first time ever */
+ !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+ /* skipping a console message */
+ cdls->cdls_count++;
+ if (tcd != NULL)
+ cfs_trace_put_tcd(tcd);
+ return 1;
+ }
+
+ if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
+ libcfs_console_max_delay
+ + cfs_time_seconds(10))) {
+ /* last timeout was a long time ago */
+ cdls->cdls_delay /= libcfs_console_backoff * 4;
+ } else {
+ cdls->cdls_delay *= libcfs_console_backoff;
+ }
+
+ if (cdls->cdls_delay < libcfs_console_min_delay)
+ cdls->cdls_delay = libcfs_console_min_delay;
+ else if (cdls->cdls_delay > libcfs_console_max_delay)
+ cdls->cdls_delay = libcfs_console_max_delay;
+
+ /* ensure cdls_next is never zero after it's been seen */
+ cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+ }
+
+ if (tcd != NULL) {
+ cfs_print_to_console(&header, mask, string_buf, needed, file,
+ msgdata->msg_fn);
+ cfs_trace_put_tcd(tcd);
+ } else {
+ string_buf = cfs_trace_get_console_buffer();
+
+ needed = 0;
+ if (format1 != NULL) {
+ va_copy(ap, args);
+ needed = vsnprintf(string_buf,
+ CFS_TRACE_CONSOLE_BUFFER_SIZE,
+ format1, ap);
+ va_end(ap);
+ }
+ if (format2 != NULL) {
+ remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
+ if (remain > 0) {
+ va_start(ap, format2);
+ needed += vsnprintf(string_buf+needed, remain,
+ format2, ap);
+ va_end(ap);
+ }
+ }
+ cfs_print_to_console(&header, mask,
+ string_buf, needed, file, msgdata->msg_fn);
+
+ cfs_trace_put_console_buffer(string_buf);
+ }
+
+ if (cdls != NULL && cdls->cdls_count != 0) {
+ string_buf = cfs_trace_get_console_buffer();
+
+ needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
+ "Skipped %d previous similar message%s\n",
+ cdls->cdls_count,
+ (cdls->cdls_count > 1) ? "s" : "");
+
+ cfs_print_to_console(&header, mask,
+ string_buf, needed, file, msgdata->msg_fn);
+
+ cfs_trace_put_console_buffer(string_buf);
+ cdls->cdls_count = 0;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+void
+cfs_trace_assertion_failed(const char *str,
+ struct libcfs_debug_msg_data *msgdata)
+{
+ struct ptldebug_header hdr;
+
+ libcfs_panic_in_progress = 1;
+ libcfs_catastrophe = 1;
+ mb();
+
+ cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());
+
+ cfs_print_to_console(&hdr, D_EMERG, str, strlen(str),
+ msgdata->msg_file, msgdata->msg_fn);
+
+ panic("Lustre debug assertion failure\n");
+
+ /* not reached */
+}
+
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+ /* Do the collect_pages job on a single CPU: assumes that all other
+ * CPUs have been stopped during a panic. If this isn't true for some
+ * arch, this will have to be implemented separately in each arch. */
+ int i;
+ int j;
+ struct cfs_trace_cpu_data *tcd;
+
+ INIT_LIST_HEAD(&pc->pc_pages);
+
+ cfs_tcd_for_each(tcd, i, j) {
+ list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+ tcd->tcd_cur_pages = 0;
+
+ if (pc->pc_want_daemon_pages) {
+ list_splice_init(&tcd->tcd_daemon_pages,
+ &pc->pc_pages);
+ tcd->tcd_cur_daemon_pages = 0;
+ }
+ }
+}
+
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+ struct cfs_trace_cpu_data *tcd;
+ int i, cpu;
+
+ spin_lock(&pc->pc_lock);
+ for_each_possible_cpu(cpu) {
+ cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+ list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+ tcd->tcd_cur_pages = 0;
+ if (pc->pc_want_daemon_pages) {
+ list_splice_init(&tcd->tcd_daemon_pages,
+ &pc->pc_pages);
+ tcd->tcd_cur_daemon_pages = 0;
+ }
+ }
+ }
+ spin_unlock(&pc->pc_lock);
+}
+
+static void collect_pages(struct page_collection *pc)
+{
+ INIT_LIST_HEAD(&pc->pc_pages);
+
+ if (libcfs_panic_in_progress)
+ panic_collect_pages(pc);
+ else
+ collect_pages_on_all_cpus(pc);
+}
+
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+ struct cfs_trace_cpu_data *tcd;
+ struct list_head *cur_head;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+ int i, cpu;
+
+ spin_lock(&pc->pc_lock);
+ for_each_possible_cpu(cpu) {
+ cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+ cur_head = tcd->tcd_pages.next;
+
+ list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
+ linkage) {
+
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ if (tage->cpu != cpu || tage->type != i)
+ continue;
+
+ cfs_tage_to_tail(tage, cur_head);
+ tcd->tcd_cur_pages++;
+ }
+ }
+ }
+ spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_back(struct page_collection *pc)
+{
+ if (!libcfs_panic_in_progress)
+ put_pages_back_on_all_cpus(pc);
+}
+
+/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that
+ * we have a good amount of data at all times for dumping during an LBUG, even
+ * if we have been steadily writing (and otherwise discarding) pages via the
+ * debug daemon. */
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+ struct cfs_trace_cpu_data *tcd)
+{
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+
+ spin_lock(&pc->pc_lock);
+ list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
+
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
+ continue;
+
+ cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
+ tcd->tcd_cur_daemon_pages++;
+
+ if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
+ struct cfs_trace_page *victim;
+
+ __LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+ victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
+
+ __LASSERT_TAGE_INVARIANT(victim);
+
+ list_del(&victim->linkage);
+ cfs_tage_free(victim);
+ tcd->tcd_cur_daemon_pages--;
+ }
+ }
+ spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_on_daemon_list(struct page_collection *pc)
+{
+ struct cfs_trace_cpu_data *tcd;
+ int i, cpu;
+
+ for_each_possible_cpu(cpu) {
+ cfs_tcd_for_each_type_lock(tcd, i, cpu)
+ put_pages_on_tcd_daemon_list(pc, tcd);
+ }
+}
+
+void cfs_trace_debug_print(void)
+{
+ struct page_collection pc;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+
+ spin_lock_init(&pc.pc_lock);
+
+ pc.pc_want_daemon_pages = 1;
+ collect_pages(&pc);
+ list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+ char *p, *file, *fn;
+ struct page *page;
+
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ page = tage->page;
+ p = page_address(page);
+ while (p < ((char *)page_address(page) + tage->used)) {
+ struct ptldebug_header *hdr;
+ int len;
+ hdr = (void *)p;
+ p += sizeof(*hdr);
+ file = p;
+ p += strlen(file) + 1;
+ fn = p;
+ p += strlen(fn) + 1;
+ len = hdr->ph_len - (int)(p - (char *)hdr);
+
+ cfs_print_to_console(hdr, D_EMERG, p, len, file, fn);
+
+ p += len;
+ }
+
+ list_del(&tage->linkage);
+ cfs_tage_free(tage);
+ }
+}
+
+int cfs_tracefile_dump_all_pages(char *filename)
+{
+ struct page_collection pc;
+ struct file *filp;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+ char *buf;
+ int rc;
+
+ DECL_MMSPACE;
+
+ cfs_tracefile_write_lock();
+
+ filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
+ if (IS_ERR(filp)) {
+ rc = PTR_ERR(filp);
+ filp = NULL;
+ pr_err("LustreError: can't open %s for dump: rc %d\n",
+ filename, rc);
+ goto out;
+ }
+
+ spin_lock_init(&pc.pc_lock);
+ pc.pc_want_daemon_pages = 1;
+ collect_pages(&pc);
+ if (list_empty(&pc.pc_pages)) {
+ rc = 0;
+ goto close;
+ }
+
+ /* ok, for now, just write the pages. in the future we'll be building
+ * iobufs with the pages and calling generic_direct_IO */
+ MMSPACE_OPEN;
+ list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ buf = kmap(tage->page);
+ rc = vfs_write(filp, (__force const char __user *)buf,
+ tage->used, &filp->f_pos);
+ kunmap(tage->page);
+
+ if (rc != (int)tage->used) {
+ printk(KERN_WARNING "wanted to write %u but wrote %d\n",
+ tage->used, rc);
+ put_pages_back(&pc);
+ __LASSERT(list_empty(&pc.pc_pages));
+ break;
+ }
+ list_del(&tage->linkage);
+ cfs_tage_free(tage);
+ }
+ MMSPACE_CLOSE;
+ rc = vfs_fsync(filp, 1);
+ if (rc)
+ pr_err("sync returns %d\n", rc);
+close:
+ filp_close(filp, NULL);
+out:
+ cfs_tracefile_write_unlock();
+ return rc;
+}
+
+void cfs_trace_flush_pages(void)
+{
+ struct page_collection pc;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+
+ spin_lock_init(&pc.pc_lock);
+
+ pc.pc_want_daemon_pages = 1;
+ collect_pages(&pc);
+ list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ list_del(&tage->linkage);
+ cfs_tage_free(tage);
+ }
+}
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+ const char __user *usr_buffer, int usr_buffer_nob)
+{
+ int nob;
+
+ if (usr_buffer_nob > knl_buffer_nob)
+ return -EOVERFLOW;
+
+ if (copy_from_user((void *)knl_buffer,
+ usr_buffer, usr_buffer_nob))
+ return -EFAULT;
+
+ nob = strnlen(knl_buffer, usr_buffer_nob);
+ while (nob-- >= 0) /* strip trailing whitespace */
+ if (!isspace(knl_buffer[nob]))
+ break;
+
+ if (nob < 0) /* empty string */
+ return -EINVAL;
+
+ if (nob == knl_buffer_nob) /* no space to terminate */
+ return -EOVERFLOW;
+
+ knl_buffer[nob + 1] = 0; /* terminate */
+ return 0;
+}
+EXPORT_SYMBOL(cfs_trace_copyin_string);
+
+int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
+ const char *knl_buffer, char *append)
+{
+ /* NB if 'append' != NULL, it's a single character to append to the
+ * copied out string - usually "\n", for /proc entries and "" (i.e. a
+ * terminating zero byte) for sysctl entries */
+ int nob = strlen(knl_buffer);
+
+ if (nob > usr_buffer_nob)
+ nob = usr_buffer_nob;
+
+ if (copy_to_user(usr_buffer, knl_buffer, nob))
+ return -EFAULT;
+
+ if (append != NULL && nob < usr_buffer_nob) {
+ if (copy_to_user(usr_buffer + nob, append, 1))
+ return -EFAULT;
+
+ nob++;
+ }
+
+ return nob;
+}
+EXPORT_SYMBOL(cfs_trace_copyout_string);
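
A sketch of how cfs_trace_copyout_string() might be used from a /proc-style read handler (the handler shape is hypothetical, not an entry defined in this patch):

/* Sketch only: copy the current trace file name to user space with a
 * trailing newline, under the read lock used elsewhere in this file. */
static int example_read_tracefile(char __user *ubuf, int ulen)
{
	int rc;

	cfs_tracefile_read_lock();
	rc = cfs_trace_copyout_string(ubuf, ulen, cfs_tracefile, "\n");
	cfs_tracefile_read_unlock();

	return rc;	/* bytes copied, or -EFAULT */
}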
+
+int cfs_trace_allocate_string_buffer(char **str, int nob)
+{
+ if (nob > 2 * PAGE_CACHE_SIZE) /* string must be "sensible" */
+ return -EINVAL;
+
+ *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO);
+ if (*str == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void cfs_trace_free_string_buffer(char *str, int nob)
+{
+ kfree(str);
+}
+
+int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob)
+{
+ char *str;
+ int rc;
+
+ rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+ if (rc != 0)
+ return rc;
+
+ rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+ usr_str, usr_str_nob);
+ if (rc != 0)
+ goto out;
+
+ if (str[0] != '/') {
+ rc = -EINVAL;
+ goto out;
+ }
+ rc = cfs_tracefile_dump_all_pages(str);
+out:
+ cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+ return rc;
+}
+
+int cfs_trace_daemon_command(char *str)
+{
+ int rc = 0;
+
+ cfs_tracefile_write_lock();
+
+ if (strcmp(str, "stop") == 0) {
+ cfs_tracefile_write_unlock();
+ cfs_trace_stop_thread();
+ cfs_tracefile_write_lock();
+ memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
+
+ } else if (strncmp(str, "size=", 5) == 0) {
+ cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
+ if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480)
+ cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+ else
+ cfs_tracefile_size <<= 20;
+
+ } else if (strlen(str) >= sizeof(cfs_tracefile)) {
+ rc = -ENAMETOOLONG;
+ } else if (str[0] != '/') {
+ rc = -EINVAL;
+ } else {
+ strcpy(cfs_tracefile, str);
+
+ printk(KERN_INFO
+ "Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n",
+ cfs_tracefile,
+ (long)(cfs_tracefile_size >> 10));
+
+ cfs_trace_start_thread();
+ }
+
+ cfs_tracefile_write_unlock();
+ return rc;
+}
+
+int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob)
+{
+ char *str;
+ int rc;
+
+ rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+ if (rc != 0)
+ return rc;
+
+ rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+ usr_str, usr_str_nob);
+ if (rc == 0)
+ rc = cfs_trace_daemon_command(str);
+
+ cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+ return rc;
+}
+
+int cfs_trace_set_debug_mb(int mb)
+{
+ int i;
+ int j;
+ int pages;
+ int limit = cfs_trace_max_debug_mb();
+ struct cfs_trace_cpu_data *tcd;
+
+ if (mb < num_possible_cpus()) {
+ printk(KERN_WARNING
+ "Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n",
+ mb, num_possible_cpus());
+ mb = num_possible_cpus();
+ }
+
+ if (mb > limit) {
+ printk(KERN_WARNING
+ "Lustre: %d MB is too large for debug buffer size, setting it to %d MB.\n",
+ mb, limit);
+ mb = limit;
+ }
+
+ mb /= num_possible_cpus();
+ pages = mb << (20 - PAGE_CACHE_SHIFT);
+
+ cfs_tracefile_write_lock();
+
+ cfs_tcd_for_each(tcd, i, j)
+ tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
+
+ cfs_tracefile_write_unlock();
+
+ return 0;
+}
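
To make the conversion above concrete (the numbers are illustrative): with 8 possible CPUs, 4 KiB pages (PAGE_CACHE_SHIFT == 12) and mb == 256, each CPU is given 256 / 8 = 32 MB, i.e. 32 << (20 - 12) = 8192 pages, and each TCD type then receives pages * tcd_pages_factor / 100 of that.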
+
+int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob)
+{
+ char str[32];
+ int rc;
+
+ rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob);
+ if (rc < 0)
+ return rc;
+
+ return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0));
+}
+
+int cfs_trace_get_debug_mb(void)
+{
+ int i;
+ int j;
+ struct cfs_trace_cpu_data *tcd;
+ int total_pages = 0;
+
+ cfs_tracefile_read_lock();
+
+ cfs_tcd_for_each(tcd, i, j)
+ total_pages += tcd->tcd_max_pages;
+
+ cfs_tracefile_read_unlock();
+
+ return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+}
+
+static int tracefiled(void *arg)
+{
+ struct page_collection pc;
+ struct tracefiled_ctl *tctl = arg;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+ struct file *filp;
+ char *buf;
+ int last_loop = 0;
+ int rc;
+
+ DECL_MMSPACE;
+
+ /* we're started late enough that we pick up init's fs context */
+ /* this is so broken in uml? what on earth is going on? */
+
+ spin_lock_init(&pc.pc_lock);
+ complete(&tctl->tctl_start);
+
+ while (1) {
+ wait_queue_t __wait;
+
+ pc.pc_want_daemon_pages = 0;
+ collect_pages(&pc);
+ if (list_empty(&pc.pc_pages))
+ goto end_loop;
+
+ filp = NULL;
+ cfs_tracefile_read_lock();
+ if (cfs_tracefile[0] != 0) {
+ filp = filp_open(cfs_tracefile,
+ O_CREAT | O_RDWR | O_LARGEFILE,
+ 0600);
+ if (IS_ERR(filp)) {
+ rc = PTR_ERR(filp);
+ filp = NULL;
+ printk(KERN_WARNING "couldn't open %s: %d\n",
+ cfs_tracefile, rc);
+ }
+ }
+ cfs_tracefile_read_unlock();
+ if (filp == NULL) {
+ put_pages_on_daemon_list(&pc);
+ __LASSERT(list_empty(&pc.pc_pages));
+ goto end_loop;
+ }
+
+ MMSPACE_OPEN;
+
+ list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+ linkage) {
+ static loff_t f_pos;
+
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ if (f_pos >= (off_t)cfs_tracefile_size)
+ f_pos = 0;
+ else if (f_pos > i_size_read(file_inode(filp)))
+ f_pos = i_size_read(file_inode(filp));
+
+ buf = kmap(tage->page);
+ rc = vfs_write(filp, (__force const char __user *)buf,
+ tage->used, &f_pos);
+ kunmap(tage->page);
+
+ if (rc != (int)tage->used) {
+ printk(KERN_WARNING "wanted to write %u but wrote %d\n",
+ tage->used, rc);
+ put_pages_back(&pc);
+ __LASSERT(list_empty(&pc.pc_pages));
+ break;
+ }
+ }
+ MMSPACE_CLOSE;
+
+ filp_close(filp, NULL);
+ put_pages_on_daemon_list(&pc);
+ if (!list_empty(&pc.pc_pages)) {
+ int i;
+
+ printk(KERN_ALERT "Lustre: trace pages aren't empty\n");
+ pr_err("total cpus(%d): ",
+ num_possible_cpus());
+ for (i = 0; i < num_possible_cpus(); i++)
+ if (cpu_online(i))
+ pr_cont("%d(on) ", i);
+ else
+ pr_cont("%d(off) ", i);
+ pr_cont("\n");
+
+ i = 0;
+ list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+ linkage)
+ pr_err("page %d belongs to cpu %d\n",
+ ++i, tage->cpu);
+ pr_err("There are %d pages unwritten\n", i);
+ }
+ __LASSERT(list_empty(&pc.pc_pages));
+end_loop:
+ if (atomic_read(&tctl->tctl_shutdown)) {
+ if (last_loop == 0) {
+ last_loop = 1;
+ continue;
+ } else {
+ break;
+ }
+ }
+ init_waitqueue_entry(&__wait, current);
+ add_wait_queue(&tctl->tctl_waitq, &__wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ remove_wait_queue(&tctl->tctl_waitq, &__wait);
+ }
+ complete(&tctl->tctl_stop);
+ return 0;
+}
+
+int cfs_trace_start_thread(void)
+{
+ struct tracefiled_ctl *tctl = &trace_tctl;
+ int rc = 0;
+
+ mutex_lock(&cfs_trace_thread_mutex);
+ if (thread_running)
+ goto out;
+
+ init_completion(&tctl->tctl_start);
+ init_completion(&tctl->tctl_stop);
+ init_waitqueue_head(&tctl->tctl_waitq);
+ atomic_set(&tctl->tctl_shutdown, 0);
+
+ if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) {
+ rc = -ECHILD;
+ goto out;
+ }
+
+ wait_for_completion(&tctl->tctl_start);
+ thread_running = 1;
+out:
+ mutex_unlock(&cfs_trace_thread_mutex);
+ return rc;
+}
+
+void cfs_trace_stop_thread(void)
+{
+ struct tracefiled_ctl *tctl = &trace_tctl;
+
+ mutex_lock(&cfs_trace_thread_mutex);
+ if (thread_running) {
+ printk(KERN_INFO
+ "Lustre: shutting down debug daemon thread...\n");
+ atomic_set(&tctl->tctl_shutdown, 1);
+ wait_for_completion(&tctl->tctl_stop);
+ thread_running = 0;
+ }
+ mutex_unlock(&cfs_trace_thread_mutex);
+}
+
+int cfs_tracefile_init(int max_pages)
+{
+ struct cfs_trace_cpu_data *tcd;
+ int i;
+ int j;
+ int rc;
+ int factor;
+
+ rc = cfs_tracefile_init_arch();
+ if (rc != 0)
+ return rc;
+
+ cfs_tcd_for_each(tcd, i, j) {
+ /* tcd_pages_factor is initialized in cfs_tracefile_init_arch(). */
+ factor = tcd->tcd_pages_factor;
+ INIT_LIST_HEAD(&tcd->tcd_pages);
+ INIT_LIST_HEAD(&tcd->tcd_stock_pages);
+ INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+ tcd->tcd_cur_pages = 0;
+ tcd->tcd_cur_stock_pages = 0;
+ tcd->tcd_cur_daemon_pages = 0;
+ tcd->tcd_max_pages = (max_pages * factor) / 100;
+ LASSERT(tcd->tcd_max_pages > 0);
+ tcd->tcd_shutting_down = 0;
+ }
+
+ return 0;
+}
+
+static void trace_cleanup_on_all_cpus(void)
+{
+ struct cfs_trace_cpu_data *tcd;
+ struct cfs_trace_page *tage;
+ struct cfs_trace_page *tmp;
+ int i, cpu;
+
+ for_each_possible_cpu(cpu) {
+ cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+ tcd->tcd_shutting_down = 1;
+
+ list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages,
+ linkage) {
+ __LASSERT_TAGE_INVARIANT(tage);
+
+ list_del(&tage->linkage);
+ cfs_tage_free(tage);
+ }
+
+ tcd->tcd_cur_pages = 0;
+ }
+ }
+}
+
+static void cfs_trace_cleanup(void)
+{
+ struct page_collection pc;
+
+ INIT_LIST_HEAD(&pc.pc_pages);
+ spin_lock_init(&pc.pc_lock);
+
+ trace_cleanup_on_all_cpus();
+
+ cfs_tracefile_fini_arch();
+}
+
+void cfs_tracefile_exit(void)
+{
+ cfs_trace_stop_thread();
+ cfs_trace_cleanup();
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.h b/drivers/staging/lustre/lustre/libcfs/tracefile.h
new file mode 100644
index 000000000..0601476e1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/tracefile.h
@@ -0,0 +1,340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_TRACEFILE_H__
+#define __LIBCFS_TRACEFILE_H__
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "linux/linux-tracefile.h"
+
+/* trace file lock routines */
+
+#define TRACEFILE_NAME_SIZE 1024
+extern char cfs_tracefile[TRACEFILE_NAME_SIZE];
+extern long long cfs_tracefile_size;
+
+extern void libcfs_run_debug_log_upcall(char *file);
+
+int cfs_tracefile_init_arch(void);
+void cfs_tracefile_fini_arch(void);
+
+void cfs_tracefile_read_lock(void);
+void cfs_tracefile_read_unlock(void);
+void cfs_tracefile_write_lock(void);
+void cfs_tracefile_write_unlock(void);
+
+int cfs_tracefile_dump_all_pages(char *filename);
+void cfs_trace_debug_print(void);
+void cfs_trace_flush_pages(void);
+int cfs_trace_start_thread(void);
+void cfs_trace_stop_thread(void);
+int cfs_tracefile_init(int max_pages);
+void cfs_tracefile_exit(void);
+
+
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+ const char __user *usr_buffer, int usr_buffer_nob);
+int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
+ const char *knl_str, char *append);
+int cfs_trace_allocate_string_buffer(char **str, int nob);
+void cfs_trace_free_string_buffer(char *str, int nob);
+int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob);
+int cfs_trace_daemon_command(char *str);
+int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob);
+int cfs_trace_set_debug_mb(int mb);
+int cfs_trace_set_debug_mb_usrstr(void __user *usr_str, int usr_str_nob);
+int cfs_trace_get_debug_mb(void);
+
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int libcfs_panic_in_progress;
+extern int cfs_trace_max_debug_mb(void);
+
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+#ifdef LUSTRE_TRACEFILE_PRIVATE
+
+/*
+ * Private declare for tracefile
+ */
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+
+#define CFS_TRACEFILE_SIZE (500 << 20)
+
+/* Size of a buffer for sprintf'ing console messages if we can't get a page
+ * from the system */
+#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024
+
+union cfs_trace_data_union {
+ struct cfs_trace_cpu_data {
+ /*
+ * Even though this structure is meant to be per-CPU, locking
+ * is needed because in some places the data may be accessed
+ * from other CPUs. This lock is directly used in trace_get_tcd
+ * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and
+ * tcd_for_each_type_lock
+ */
+ spinlock_t tcd_lock;
+ unsigned long tcd_lock_flags;
+
+ /*
+ * pages with trace records not yet processed by tracefiled.
+ */
+ struct list_head tcd_pages;
+ /* number of pages on ->tcd_pages */
+ unsigned long tcd_cur_pages;
+
+ /*
+ * pages with trace records already processed by
+ * tracefiled. These pages are kept in memory, so that some
+ * portion of log can be written in the event of LBUG. This
+ * list is maintained in LRU order.
+ *
+ * Pages are moved to ->tcd_daemon_pages by tracefiled()
+ * (put_pages_on_daemon_list()). LRU pages from this list are
+ * discarded when list grows too large.
+ */
+ struct list_head tcd_daemon_pages;
+ /* number of pages on ->tcd_daemon_pages */
+ unsigned long tcd_cur_daemon_pages;
+
+ /*
+ * Maximal number of pages allowed on ->tcd_pages and
+ * ->tcd_daemon_pages each.
+ * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
+ * implementation.
+ */
+ unsigned long tcd_max_pages;
+
+ /*
+ * preallocated pages to write trace records into. Pages from
+ * ->tcd_stock_pages are moved to ->tcd_pages by
+ * portals_debug_msg().
+ *
+ * This list is necessary, because on some platforms it's
+ * impossible to perform efficient atomic page allocation in a
+ * non-blockable context.
+ *
+ * Such platforms fill ->tcd_stock_pages "on occasion", when
+ * tracing code is entered in blockable context.
+ *
+ * trace_get_tage_try() tries to get a page from
+ * ->tcd_stock_pages first and resorts to atomic page
+ * allocation only if this queue is empty. ->tcd_stock_pages
+ * is replenished when tracing code is entered in blocking
+ * context (darwin-tracefile.c:trace_get_tcd()). We try to
+ * maintain TCD_STOCK_PAGES (40 by default) pages in this
+ * queue. Atomic allocation is only required if more than
+ * TCD_STOCK_PAGES pagesful are consumed by trace records all
+ * emitted in non-blocking contexts. Which is quite unlikely.
+ */
+ struct list_head tcd_stock_pages;
+ /* number of pages on ->tcd_stock_pages */
+ unsigned long tcd_cur_stock_pages;
+
+ unsigned short tcd_shutting_down;
+ unsigned short tcd_cpu;
+ unsigned short tcd_type;
+ /* The factors to share debug memory. */
+ unsigned short tcd_pages_factor;
+ } tcd;
+ char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))];
+};
+
+#define TCD_MAX_TYPES 8
+extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS];
+
+#define cfs_tcd_for_each(tcd, i, j) \
+ for (i = 0; cfs_trace_data[i] != NULL; i++) \
+ for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \
+ j < num_possible_cpus(); \
+ j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \
+ for (i = 0; cfs_trace_data[i] && \
+ (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \
+ cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct page_collection {
+ struct list_head pc_pages;
+ /*
+ * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+ * call-back functions. XXX nikita: Which is horrible: all processors
+ * receive NMI at the same time only to be serialized by this
+ * lock. Probably ->pc_pages should be replaced with an array of
+ * NR_CPUS elements accessed locklessly.
+ */
+ spinlock_t pc_lock;
+ /*
+ * if this flag is set, collect_pages() will spill both
+ * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+ * only ->tcd_pages are spilled.
+ */
+ int pc_want_daemon_pages;
+};
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct tracefiled_ctl {
+ struct completion tctl_start;
+ struct completion tctl_stop;
+ wait_queue_head_t tctl_waitq;
+ pid_t tctl_pid;
+ atomic_t tctl_shutdown;
+};
+
+/*
+ * small data-structure for each page owned by tracefiled.
+ */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct cfs_trace_page {
+ /*
+ * page itself
+ */
+ struct page *page;
+ /*
+ * linkage into one of the lists in trace_data_union or
+ * page_collection
+ */
+ struct list_head linkage;
+ /*
+ * number of bytes used within this page
+ */
+ unsigned int used;
+ /*
+ * cpu that owns this page
+ */
+ unsigned short cpu;
+ /*
+ * type(context) of this page
+ */
+ unsigned short type;
+};
+
+extern void cfs_set_ptldebug_header(struct ptldebug_header *header,
+ struct libcfs_debug_msg_data *m,
+ unsigned long stack);
+extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+ const char *buf, int len, const char *file,
+ const char *fn);
+
+extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+
+/**
+ * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][]
+ * are not public libcfs API; they should be defined in
+ * platform-specific tracefile include files
+ * (see, for example, linux-tracefile.h).
+ */
+
+extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void);
+
+static inline char *
+cfs_trace_get_console_buffer(void)
+{
+ unsigned int i = get_cpu();
+ unsigned int j = cfs_trace_buf_idx_get();
+
+ return cfs_trace_console_buffers[i][j];
+}
+
+static inline void
+cfs_trace_put_console_buffer(char *buffer)
+{
+ put_cpu();
+}
+
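
The console-buffer helpers above are a strict get/put pair:
cfs_trace_get_console_buffer() disables preemption via get_cpu() and selects
the per-CPU, per-context buffer, and cfs_trace_put_console_buffer() is what
re-enables preemption. A minimal usage sketch follows (illustrative only; the
buffer length passed to snprintf() is an assumption for the example, not a
value taken from this header):

	static void example_trace_console(int value)
	{
		char *buf = cfs_trace_get_console_buffer();

		/* preemption is disabled here, so keep the work short */
		snprintf(buf, 80, "example value: %d\n", value);

		/* ... hand buf to the platform console-output routine ... */

		cfs_trace_put_console_buffer(buf);	/* re-enables preemption */
	}
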
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+ struct cfs_trace_cpu_data *tcd =
+ &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;
+
+ cfs_trace_lock_tcd(tcd, 0);
+
+ return tcd;
+}
+
+static inline void
+cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd)
+{
+ cfs_trace_unlock_tcd(tcd, 0);
+
+ put_cpu();
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp,
+ struct list_head *stock);
+
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+ struct cfs_trace_page *tage);
+
+extern void cfs_trace_assertion_failed(const char *str,
+ struct libcfs_debug_msg_data *m);
+
+/* ASSERTION that is safe to use within the debug system */
+#define __LASSERT(cond) \
+do { \
+ if (unlikely(!(cond))) { \
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \
+ cfs_trace_assertion_failed("ASSERTION("#cond") failed", \
+ &msgdata); \
+ } \
+} while (0)
+
+#define __LASSERT_TAGE_INVARIANT(tage) \
+do { \
+ __LASSERT(tage != NULL); \
+ __LASSERT(tage->page != NULL); \
+ __LASSERT(tage->used <= PAGE_CACHE_SIZE); \
+ __LASSERT(page_count(tage->page) > 0); \
+} while (0)
+
+#endif /* LUSTRE_TRACEFILE_PRIVATE */
+
+#endif /* __LIBCFS_TRACEFILE_H__ */
diff --git a/drivers/staging/lustre/lustre/libcfs/workitem.c b/drivers/staging/lustre/lustre/libcfs/workitem.c
new file mode 100644
index 000000000..48009b775
--- /dev/null
+++ b/drivers/staging/lustre/lustre/libcfs/workitem.c
@@ -0,0 +1,479 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/workitem.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Liang Zhen <zhen.liang@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#define CFS_WS_NAME_LEN 16
+
+typedef struct cfs_wi_sched {
+ struct list_head ws_list; /* chain on global list */
+ /** serialised workitems */
+ spinlock_t ws_lock;
+ /** where schedulers sleep */
+ wait_queue_head_t ws_waitq;
+ /** concurrent workitems */
+ struct list_head ws_runq;
+	/** rescheduled running-workitems: a workitem can be rescheduled
+	 * while running in wi_action(), but we don't want to execute it
+	 * again until it returns from wi_action(), so we put it on
+	 * ws_rerunq while rescheduling, and move it back to ws_runq
+	 * after wi_action() returns */
+ struct list_head ws_rerunq;
+ /** CPT-table for this scheduler */
+ struct cfs_cpt_table *ws_cptab;
+ /** CPT id for affinity */
+ int ws_cpt;
+ /** number of scheduled workitems */
+ int ws_nscheduled;
+ /** started scheduler thread, protected by cfs_wi_data::wi_glock */
+ unsigned int ws_nthreads:30;
+ /** shutting down, protected by cfs_wi_data::wi_glock */
+ unsigned int ws_stopping:1;
+ /** serialize starting thread, protected by cfs_wi_data::wi_glock */
+ unsigned int ws_starting:1;
+ /** scheduler name */
+ char ws_name[CFS_WS_NAME_LEN];
+} cfs_wi_sched_t;
+
+static struct cfs_workitem_data {
+ /** serialize */
+ spinlock_t wi_glock;
+ /** list of all schedulers */
+ struct list_head wi_scheds;
+ /** WI module is initialized */
+ int wi_init;
+ /** shutting down the whole WI module */
+ int wi_stopping;
+} cfs_wi_data;
+
+static inline void
+cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+{
+ spin_lock(&sched->ws_lock);
+}
+
+static inline void
+cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
+{
+ spin_unlock(&sched->ws_lock);
+}
+
+static inline int
+cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
+{
+ cfs_wi_sched_lock(sched);
+ if (sched->ws_stopping) {
+ cfs_wi_sched_unlock(sched);
+ return 0;
+ }
+
+ if (!list_empty(&sched->ws_runq)) {
+ cfs_wi_sched_unlock(sched);
+ return 0;
+ }
+ cfs_wi_sched_unlock(sched);
+ return 1;
+}
+
+
+/* XXX:
+ * 0. it only works when called from within wi->wi_action.
+ * 1. after it returns, no one shall try to schedule the workitem again.
+ */
+void
+cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+ LASSERT(!in_interrupt()); /* because we use plain spinlock */
+ LASSERT(!sched->ws_stopping);
+
+ cfs_wi_sched_lock(sched);
+
+ LASSERT(wi->wi_running);
+ if (wi->wi_scheduled) { /* cancel pending schedules */
+ LASSERT(!list_empty(&wi->wi_list));
+ list_del_init(&wi->wi_list);
+
+ LASSERT(sched->ws_nscheduled > 0);
+ sched->ws_nscheduled--;
+ }
+
+ LASSERT(list_empty(&wi->wi_list));
+
+ wi->wi_scheduled = 1; /* LBUG future schedule attempts */
+ cfs_wi_sched_unlock(sched);
+
+ return;
+}
+EXPORT_SYMBOL(cfs_wi_exit);
+
+/**
+ * cancel schedule request of workitem \a wi
+ */
+int
+cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+ int rc;
+
+ LASSERT(!in_interrupt()); /* because we use plain spinlock */
+ LASSERT(!sched->ws_stopping);
+
+ /*
+ * return 0 if it's running already, otherwise return 1, which
+ * means the workitem will not be scheduled and will not have
+ * any race with wi_action.
+ */
+ cfs_wi_sched_lock(sched);
+
+ rc = !(wi->wi_running);
+
+ if (wi->wi_scheduled) { /* cancel pending schedules */
+ LASSERT(!list_empty(&wi->wi_list));
+ list_del_init(&wi->wi_list);
+
+ LASSERT(sched->ws_nscheduled > 0);
+ sched->ws_nscheduled--;
+
+ wi->wi_scheduled = 0;
+ }
+
+ LASSERT (list_empty(&wi->wi_list));
+
+ cfs_wi_sched_unlock(sched);
+ return rc;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
+
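cfs_wi_deschedule() returns 0 when the workitem's action is still running, so
a caller that wants to tear a workitem down synchronously has to wait for the
action to finish in that case. A hedged sketch of one such pattern
(illustrative only; it assumes wi_action() returns 0 so the scheduler thread
eventually clears wi_running, and the polling backoff is this example's
choice, not something mandated by this file):

	static void example_wi_cancel_sync(struct cfs_wi_sched *sched,
					   cfs_workitem_t *wi)
	{
		/* keep descheduling until the workitem is neither queued nor
		 * running; rc == 0 means wi_action() is still in flight */
		while (cfs_wi_deschedule(sched, wi) == 0) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
		}
	}
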
+/*
+ * A workitem scheduled with (serial == 1) is strictly serialised not only
+ * with itself, but also with others scheduled this way.
+ *
+ * For now there is only one static serialised queue, but more might be added
+ * in the future, and even dynamic creation of serialised queues might be
+ * supported.
+ */
+void
+cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+ LASSERT(!in_interrupt()); /* because we use plain spinlock */
+ LASSERT(!sched->ws_stopping);
+
+ cfs_wi_sched_lock(sched);
+
+ if (!wi->wi_scheduled) {
+ LASSERT (list_empty(&wi->wi_list));
+
+ wi->wi_scheduled = 1;
+ sched->ws_nscheduled++;
+ if (!wi->wi_running) {
+ list_add_tail(&wi->wi_list, &sched->ws_runq);
+ wake_up(&sched->ws_waitq);
+ } else {
+ list_add(&wi->wi_list, &sched->ws_rerunq);
+ }
+ }
+
+ LASSERT (!list_empty(&wi->wi_list));
+ cfs_wi_sched_unlock(sched);
+ return;
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
+
+
+static int
+cfs_wi_scheduler (void *arg)
+{
+ struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg;
+
+ cfs_block_allsigs();
+
+ /* CPT affinity scheduler? */
+ if (sched->ws_cptab != NULL)
+ cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);
+
+ spin_lock(&cfs_wi_data.wi_glock);
+
+ LASSERT(sched->ws_starting == 1);
+ sched->ws_starting--;
+ sched->ws_nthreads++;
+
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ cfs_wi_sched_lock(sched);
+
+ while (!sched->ws_stopping) {
+ int nloops = 0;
+ int rc;
+ cfs_workitem_t *wi;
+
+ while (!list_empty(&sched->ws_runq) &&
+ nloops < CFS_WI_RESCHED) {
+ wi = list_entry(sched->ws_runq.next,
+ cfs_workitem_t, wi_list);
+ LASSERT(wi->wi_scheduled && !wi->wi_running);
+
+ list_del_init(&wi->wi_list);
+
+ LASSERT(sched->ws_nscheduled > 0);
+ sched->ws_nscheduled--;
+
+ wi->wi_running = 1;
+ wi->wi_scheduled = 0;
+
+
+ cfs_wi_sched_unlock(sched);
+ nloops++;
+
+ rc = (*wi->wi_action) (wi);
+
+ cfs_wi_sched_lock(sched);
+			/* WI should be dead, and may even have been freed! */
+			if (rc != 0)
+ continue;
+
+ wi->wi_running = 0;
+ if (list_empty(&wi->wi_list))
+ continue;
+
+ LASSERT(wi->wi_scheduled);
+			/* wi was rescheduled and should be on the rerunq
+			 * now; move it to the runq so its action can run
+			 * again */
+ list_move_tail(&wi->wi_list, &sched->ws_runq);
+ }
+
+ if (!list_empty(&sched->ws_runq)) {
+ cfs_wi_sched_unlock(sched);
+ /* don't sleep because some workitems still
+ * expect me to come back soon */
+ cond_resched();
+ cfs_wi_sched_lock(sched);
+ continue;
+ }
+
+ cfs_wi_sched_unlock(sched);
+ rc = wait_event_interruptible_exclusive(sched->ws_waitq,
+ !cfs_wi_sched_cansleep(sched));
+ cfs_wi_sched_lock(sched);
+ }
+
+ cfs_wi_sched_unlock(sched);
+
+ spin_lock(&cfs_wi_data.wi_glock);
+ sched->ws_nthreads--;
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ return 0;
+}
+
+
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
+{
+ int i;
+
+ LASSERT(cfs_wi_data.wi_init);
+ LASSERT(!cfs_wi_data.wi_stopping);
+
+ spin_lock(&cfs_wi_data.wi_glock);
+ if (sched->ws_stopping) {
+		CDEBUG(D_INFO, "%s is in the process of stopping\n",
+ sched->ws_name);
+ spin_unlock(&cfs_wi_data.wi_glock);
+ return;
+ }
+
+ LASSERT(!list_empty(&sched->ws_list));
+ sched->ws_stopping = 1;
+
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ i = 2;
+ wake_up_all(&sched->ws_waitq);
+
+ spin_lock(&cfs_wi_data.wi_glock);
+ while (sched->ws_nthreads > 0) {
+ CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
+ "waiting for %d threads of WI sched[%s] to terminate\n",
+ sched->ws_nthreads, sched->ws_name);
+
+ spin_unlock(&cfs_wi_data.wi_glock);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1) / 20);
+ spin_lock(&cfs_wi_data.wi_glock);
+ }
+
+ list_del(&sched->ws_list);
+
+ spin_unlock(&cfs_wi_data.wi_glock);
+ LASSERT(sched->ws_nscheduled == 0);
+
+ LIBCFS_FREE(sched, sizeof(*sched));
+}
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
+
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+ int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
+{
+ struct cfs_wi_sched *sched;
+ int rc;
+
+ LASSERT(cfs_wi_data.wi_init);
+ LASSERT(!cfs_wi_data.wi_stopping);
+ LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+ (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+ LIBCFS_ALLOC(sched, sizeof(*sched));
+ if (sched == NULL)
+ return -ENOMEM;
+
+ strncpy(sched->ws_name, name, CFS_WS_NAME_LEN);
+ sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
+ sched->ws_cptab = cptab;
+ sched->ws_cpt = cpt;
+
+ spin_lock_init(&sched->ws_lock);
+ init_waitqueue_head(&sched->ws_waitq);
+ INIT_LIST_HEAD(&sched->ws_runq);
+ INIT_LIST_HEAD(&sched->ws_rerunq);
+ INIT_LIST_HEAD(&sched->ws_list);
+
+ rc = 0;
+ while (nthrs > 0) {
+ char name[16];
+ struct task_struct *task;
+
+ spin_lock(&cfs_wi_data.wi_glock);
+ while (sched->ws_starting > 0) {
+ spin_unlock(&cfs_wi_data.wi_glock);
+ schedule();
+ spin_lock(&cfs_wi_data.wi_glock);
+ }
+
+ sched->ws_starting++;
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+ snprintf(name, sizeof(name), "%s_%02d_%02u",
+ sched->ws_name, sched->ws_cpt,
+ sched->ws_nthreads);
+ } else {
+ snprintf(name, sizeof(name), "%s_%02u",
+ sched->ws_name, sched->ws_nthreads);
+ }
+
+ task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
+ if (!IS_ERR(task)) {
+ nthrs--;
+ continue;
+ }
+ rc = PTR_ERR(task);
+
+ CERROR("Failed to create thread for WI scheduler %s: %d\n",
+ name, rc);
+
+ spin_lock(&cfs_wi_data.wi_glock);
+
+		/* put it on the list so cfs_wi_sched_destroy() can remove it */
+ list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+ sched->ws_starting--;
+
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ cfs_wi_sched_destroy(sched);
+ return rc;
+ }
+ spin_lock(&cfs_wi_data.wi_glock);
+ list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ *sched_pp = sched;
+ return 0;
+}
+EXPORT_SYMBOL(cfs_wi_sched_create);
+
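Putting the pieces together, a typical client of this API creates a scheduler,
initialises a workitem with an action callback, and queues it with
cfs_wi_schedule(). The sketch below is illustrative only: cfs_wi_init() and
the cfs_workitem_t layout are assumed from the companion workitem header
rather than defined in this file.

	static int example_wi_action(cfs_workitem_t *wi)
	{
		CDEBUG(D_INFO, "example workitem ran\n");
		return 0;	/* 0: the workitem stays valid and may be rescheduled */
	}

	static int example_wi_usage(void)
	{
		struct cfs_wi_sched *sched;
		cfs_workitem_t wi;
		int rc;

		/* no CPT affinity: NULL cpt-table, CFS_CPT_ANY, one thread */
		rc = cfs_wi_sched_create("example", NULL, CFS_CPT_ANY, 1, &sched);
		if (rc != 0)
			return rc;

		cfs_wi_init(&wi, NULL, example_wi_action);	/* assumed helper */
		cfs_wi_schedule(sched, &wi);

		/* ... later, once the workitem is known to be idle ... */
		cfs_wi_deschedule(sched, &wi);
		cfs_wi_sched_destroy(sched);
		return 0;
	}
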
+int
+cfs_wi_startup(void)
+{
+ memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
+
+ spin_lock_init(&cfs_wi_data.wi_glock);
+ INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+ cfs_wi_data.wi_init = 1;
+
+ return 0;
+}
+
+void
+cfs_wi_shutdown(void)
+{
+ struct cfs_wi_sched *sched;
+
+ spin_lock(&cfs_wi_data.wi_glock);
+ cfs_wi_data.wi_stopping = 1;
+ spin_unlock(&cfs_wi_data.wi_glock);
+
+ /* nobody should contend on this list */
+ list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+ sched->ws_stopping = 1;
+ wake_up_all(&sched->ws_waitq);
+ }
+
+ list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+ spin_lock(&cfs_wi_data.wi_glock);
+
+ while (sched->ws_nthreads != 0) {
+ spin_unlock(&cfs_wi_data.wi_glock);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1) / 20);
+ spin_lock(&cfs_wi_data.wi_glock);
+ }
+ spin_unlock(&cfs_wi_data.wi_glock);
+ }
+ while (!list_empty(&cfs_wi_data.wi_scheds)) {
+ sched = list_entry(cfs_wi_data.wi_scheds.next,
+ struct cfs_wi_sched, ws_list);
+ list_del(&sched->ws_list);
+ LIBCFS_FREE(sched, sizeof(*sched));
+ }
+
+ cfs_wi_data.wi_stopping = 0;
+ cfs_wi_data.wi_init = 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile
new file mode 100644
index 000000000..7d70115d5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_LUSTRE_FS) += lustre.o
+obj-$(CONFIG_LUSTRE_LLITE_LLOOP) += llite_lloop.o
+lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \
+ rw.o namei.o symlink.o llite_mmap.o \
+ xattr.o xattr_cache.o remote_perm.o llite_rmtacl.o llite_capa.o \
+ rw26.o super25.o statahead.o \
+ ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \
+ vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
+
+lustre-$(CONFIG_PROC_FS) += lproc_llite.o
+llite_lloop-y := lloop.o
diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c
new file mode 100644
index 000000000..5af013513
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/dcache.c
@@ -0,0 +1,363 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/quotaops.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/obd_support.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_dlm.h"
+
+#include "llite_internal.h"
+
+static void free_dentry_data(struct rcu_head *head)
+{
+ struct ll_dentry_data *lld;
+
+ lld = container_of(head, struct ll_dentry_data, lld_rcu_head);
+ OBD_FREE_PTR(lld);
+}
+
+/* should NOT be called with the dcache lock, see fs/dcache.c */
+static void ll_release(struct dentry *de)
+{
+ struct ll_dentry_data *lld;
+
+ LASSERT(de != NULL);
+ lld = ll_d2d(de);
+ if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */
+ return;
+
+ if (lld->lld_it) {
+ ll_intent_release(lld->lld_it);
+ OBD_FREE(lld->lld_it, sizeof(*lld->lld_it));
+ }
+
+ de->d_fsdata = NULL;
+ call_rcu(&lld->lld_rcu_head, free_dentry_data);
+}
+
+/* Compare whether two dentries are the same. Don't match if the existing
+ * dentry is marked invalid. Returns 1 if different, 0 if the same.
+ *
+ * This avoids a race where ll_lookup_it() instantiates a dentry, but we get
+ * an AST before calling d_revalidate_it(). The dentry still exists (marked
+ * INVALID) so d_lookup() matches it, but we have no lock on it (so
+ * lock_match() fails) and we spin around real_lookup(). */
+static int ll_dcompare(const struct dentry *parent, const struct dentry *dentry,
+ unsigned int len, const char *str,
+ const struct qstr *name)
+{
+ if (len != name->len)
+ return 1;
+
+ if (memcmp(str, name->name, len))
+ return 1;
+
+ CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
+ name->len, name->name, dentry, dentry->d_flags,
+ d_count(dentry));
+
+ /* mountpoint is always valid */
+ if (d_mountpoint((struct dentry *)dentry))
+ return 0;
+
+ if (d_lustre_invalid(dentry))
+ return 1;
+
+ return 0;
+}
+
+static inline int return_if_equal(struct ldlm_lock *lock, void *data)
+{
+ if ((lock->l_flags &
+ (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) ==
+ (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA))
+ return LDLM_ITER_CONTINUE;
+ return LDLM_ITER_STOP;
+}
+
+/* find any ldlm lock of the inode in mdc and lov
+ * return 0	if no lock is found
+ *	  1	if a lock is found
+ *	< 0	on error */
+static int find_cbdata(struct inode *inode)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct lov_stripe_md *lsm;
+ int rc = 0;
+
+ LASSERT(inode);
+ rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
+ return_if_equal, NULL);
+ if (rc != 0)
+ return rc;
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (lsm == NULL)
+ return rc;
+
+ rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL);
+ ccc_inode_lsm_put(inode, lsm);
+
+ return rc;
+}
+
+/**
+ * Called when last reference to a dentry is dropped and dcache wants to know
+ * whether or not it should cache it:
+ * - return 1 to delete the dentry immediately
+ * - return 0 to cache the dentry
+ * Should NOT be called with the dcache lock, see fs/dcache.c
+ */
+static int ll_ddelete(const struct dentry *de)
+{
+ LASSERT(de);
+
+ CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n",
+ d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping",
+ de, de, de->d_parent, d_inode(de),
+ d_unhashed(de) ? "" : "hashed,",
+ list_empty(&de->d_subdirs) ? "" : "subdirs");
+
+	/* For kernel >= 2.6.38 the last refcount is decreased after this
+	 * function returns. */
+ LASSERT(d_count(de) == 1);
+
+	/* This piece of code is disabled temporarily because it is called
+	 * inside dcache_lock, so it is not appropriate to do lots of work
+	 * here. ATTENTION: before enabling this code, LU-2487 must be
+	 * resolved. */
+#if 0
+	/* if there is no ldlm lock for this inode, set i_nlink to 0 so that
+	 * this inode can be recycled later (b=20433) */
+ if (d_really_is_positive(de) && !find_cbdata(d_inode(de)))
+ clear_nlink(d_inode(de));
+#endif
+
+ if (d_lustre_invalid((struct dentry *)de))
+ return 1;
+ return 0;
+}
+
+int ll_d_init(struct dentry *de)
+{
+ LASSERT(de != NULL);
+
+ CDEBUG(D_DENTRY, "ldd on dentry %pd (%p) parent %p inode %p refc %d\n",
+ de, de, de->d_parent, d_inode(de),
+ d_count(de));
+
+ if (de->d_fsdata == NULL) {
+ struct ll_dentry_data *lld;
+
+ lld = kzalloc(sizeof(*lld), GFP_NOFS);
+ if (likely(lld)) {
+ spin_lock(&de->d_lock);
+ if (likely(de->d_fsdata == NULL)) {
+ de->d_fsdata = lld;
+ __d_lustre_invalidate(de);
+ } else {
+ OBD_FREE_PTR(lld);
+ }
+ spin_unlock(&de->d_lock);
+ } else {
+ return -ENOMEM;
+ }
+ }
+ LASSERT(de->d_op == &ll_d_ops);
+
+ return 0;
+}
+
+void ll_intent_drop_lock(struct lookup_intent *it)
+{
+ if (it->it_op && it->d.lustre.it_lock_mode) {
+ struct lustre_handle handle;
+
+ handle.cookie = it->d.lustre.it_lock_handle;
+
+ CDEBUG(D_DLMTRACE, "releasing lock with cookie %#llx from it %p\n",
+ handle.cookie, it);
+ ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode);
+
+		/* bug 494: intent_release may be called multiple times from
+		 * this thread, and we don't want to double-decref this lock */
+ it->d.lustre.it_lock_mode = 0;
+ if (it->d.lustre.it_remote_lock_mode != 0) {
+ handle.cookie = it->d.lustre.it_remote_lock_handle;
+
+ CDEBUG(D_DLMTRACE, "releasing remote lock with cookie%#llx from it %p\n",
+ handle.cookie, it);
+ ldlm_lock_decref(&handle,
+ it->d.lustre.it_remote_lock_mode);
+ it->d.lustre.it_remote_lock_mode = 0;
+ }
+ }
+}
+
+void ll_intent_release(struct lookup_intent *it)
+{
+ CDEBUG(D_INFO, "intent %p released\n", it);
+ ll_intent_drop_lock(it);
+ /* We are still holding extra reference on a request, need to free it */
+ if (it_disposition(it, DISP_ENQ_OPEN_REF))
+ ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */
+
+ if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */
+ ptlrpc_req_finished(it->d.lustre.it_data);
+
+ it->d.lustre.it_disposition = 0;
+ it->d.lustre.it_data = NULL;
+}
+
+void ll_invalidate_aliases(struct inode *inode)
+{
+ struct dentry *dentry;
+ struct ll_d_hlist_node *p;
+
+ LASSERT(inode != NULL);
+
+ CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
+ inode->i_ino, inode->i_generation, inode);
+
+ ll_lock_dcache(inode);
+ ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_u.d_alias) {
+ CDEBUG(D_DENTRY, "dentry in drop %pd (%p) parent %p inode %p flags %d\n",
+ dentry, dentry, dentry->d_parent,
+ d_inode(dentry), dentry->d_flags);
+
+ d_lustre_invalidate(dentry, 0);
+ }
+ ll_unlock_dcache(inode);
+}
+
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+ struct lookup_intent *it,
+ struct inode *inode)
+{
+ int rc = 0;
+
+ if (!request)
+ return 0;
+
+ if (it_disposition(it, DISP_LOOKUP_NEG))
+ return -ENOENT;
+
+ rc = ll_prep_inode(&inode, request, NULL, it);
+
+ return rc;
+}
+
+void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode)
+{
+ LASSERT(it != NULL);
+
+ if (it->d.lustre.it_lock_mode && inode != NULL) {
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+ CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+ inode, inode->i_ino, inode->i_generation);
+ ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+ }
+
+ /* drop lookup or getattr locks immediately */
+ if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) {
+		/* on 2.6 there are situations when several lookups and
+		 * revalidations may be requested during a single operation.
+		 * therefore, we don't release the intent here -bzzz */
+ ll_intent_drop_lock(it);
+ }
+}
+
+static int ll_revalidate_dentry(struct dentry *dentry,
+ unsigned int lookup_flags)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+
+ /*
+ * if open&create is set, talk to MDS to make sure file is created if
+ * necessary, because we can't do this in ->open() later since that's
+	 * called on an inode. return 0 here to let lookup handle this.
+ */
+ if ((lookup_flags & (LOOKUP_OPEN | LOOKUP_CREATE)) ==
+ (LOOKUP_OPEN | LOOKUP_CREATE))
+ return 0;
+
+ if (lookup_flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE))
+ return 1;
+
+ if (d_need_statahead(dir, dentry) <= 0)
+ return 1;
+
+ if (lookup_flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ do_statahead_enter(dir, &dentry, d_inode(dentry) == NULL);
+ ll_statahead_mark(dir, dentry);
+ return 1;
+}
+
+/*
+ * Always trust cached dentries. Update statahead window if necessary.
+ */
+static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags)
+{
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, flags=%u\n",
+ dentry, flags);
+
+ return ll_revalidate_dentry(dentry, flags);
+}
+
+
+static void ll_d_iput(struct dentry *de, struct inode *inode)
+{
+ LASSERT(inode);
+ if (!find_cbdata(inode))
+ clear_nlink(inode);
+ iput(inode);
+}
+
+const struct dentry_operations ll_d_ops = {
+ .d_revalidate = ll_revalidate_nd,
+ .d_release = ll_release,
+ .d_delete = ll_ddelete,
+ .d_iput = ll_d_iput,
+ .d_compare = ll_dcompare,
+};
diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
new file mode 100644
index 000000000..a5bc694dc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -0,0 +1,1971 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/dir.c
+ *
+ * Directory code for lustre client.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/buffer_head.h> /* for wait_on_buffer */
+#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_fid.h"
+#include "llite_internal.h"
+
+/*
+ * (new) readdir implementation overview.
+ *
+ * The original lustre readdir implementation cached an exact copy of raw
+ * directory pages on the client. These pages were indexed in the client page
+ * cache by logical offset in the directory file. This design, while very
+ * simple and intuitive, had some inherent problems:
+ *
+ * . it implies that byte offset to the directory entry serves as a
+ * telldir(3)/seekdir(3) cookie, but that offset is not stable: in
+ * ext3/htree directory entries may move due to splits, and more
+ * importantly,
+ *
+ * . it is incompatible with the design of split directories for cmd3,
+ * that assumes that names are distributed across nodes based on their
+ * hash, and so readdir should be done in hash order.
+ *
+ * The new readdir implementation does readdir in hash order, and uses the
+ * hash of a file name as a telldir/seekdir cookie. This led to a number of
+ * complications:
+ *
+ * . hash is not unique, so it cannot be used to index cached directory
+ * pages on the client (note, that it requires a whole pageful of hash
+ * collided entries to cause two pages to have identical hashes);
+ *
+ * . hash is not unique, so it cannot, strictly speaking, be used as an
+ * entry cookie. ext3/htree has the same problem and lustre implementation
+ * mimics their solution: seekdir(hash) positions directory at the first
+ * entry with the given hash.
+ *
+ * Client side.
+ *
+ * 0. caching
+ *
+ * Client caches directory pages using hash of the first entry as an index. As
+ * noted above hash is not unique, so this solution doesn't work as is:
+ * special processing is needed for "page hash chains" (i.e., sequences of
+ * pages filled with entries all having the same hash value).
+ *
+ * First, such chains have to be detected. To this end, server returns to the
+ * client the hash of the first entry on the page next to one returned. When
+ * client detects that this hash is the same as hash of the first entry on the
+ * returned page, page hash collision has to be handled. Pages in the
+ * hash chain, except first one, are termed "overflow pages".
+ *
+ * The solution to the index uniqueness problem is to not cache overflow
+ * pages. Instead, when page hash collision is detected, all overflow pages
+ * from emerging chain are immediately requested from the server and placed in
+ * a special data structure (struct ll_dir_chain). This data structure is used
+ * by ll_readdir() to process entries from overflow pages. When readdir
+ * invocation finishes, overflow pages are discarded. If the page hash
+ * collision chain wasn't completely processed, the next call to readdir will
+ * again detect the page hash collision, read the overflow pages in again,
+ * process the next portion of entries and discard the pages again. This is
+ * not as wasteful as it looks, because, given a reasonable hash, page hash
+ * collisions are extremely rare.
+ *
+ * 1. directory positioning
+ *
+ * When seekdir(hash) is called, original
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * Server.
+ *
+ * identification of and access to overflow pages
+ *
+ * page format
+ *
+ * Pages in an MDS_READPAGE RPC are packed in LU_PAGE_SIZE units, and each
+ * page contains an lu_dirpage header which describes the start/end hash, and
+ * whether this page is empty (contains no dir entry) or its hash collides
+ * with the next page. After the client receives the reply, several pages are
+ * integrated into a dir page of PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE is
+ * greater than LU_PAGE_SIZE), and the lu_dirpage for this integrated page is
+ * adjusted. See lmv_adjust_dirpages().
+ *
+ */
+
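Because the telldir()/seekdir() cookie is a name hash under this scheme, a
reposition resumes at the first entry carrying the saved hash rather than at
an exact byte offset. The plain POSIX userspace snippet below (illustrative
only, not kernel code) shows the cookie round-trip that the design above has
to support:

	#include <dirent.h>
	#include <stdio.h>

	static int list_with_restart(const char *path)
	{
		DIR *d = opendir(path);
		struct dirent *de;
		long cookie;

		if (d == NULL)
			return -1;

		readdir(d);		/* consume one entry ...            */
		cookie = telldir(d);	/* ... and save the opaque cookie   */

		while ((de = readdir(d)) != NULL)
			printf("%s\n", de->d_name);

		/* reposition: on lustre this resumes at the first entry
		 * whose name hash matches the saved cookie */
		seekdir(d, cookie);
		while ((de = readdir(d)) != NULL)
			printf("again: %s\n", de->d_name);

		closedir(d);
		return 0;
	}
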
+/* returns the page unlocked, but with a reference */
+static int ll_dir_filler(void *_hash, struct page *page0)
+{
+ struct inode *inode = page0->mapping->host;
+ int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
+ struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
+ struct ptlrpc_request *request;
+ struct mdt_body *body;
+ struct md_op_data *op_data;
+ __u64 hash = *((__u64 *)_hash);
+ struct page **page_pool;
+ struct page *page;
+ struct lu_dirpage *dp;
+ int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
+	int nrdpgs = 0; /* number of pages actually read */
+ int npages;
+ int i;
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash %llu\n",
+ inode->i_ino, inode->i_generation, inode, hash);
+
+ LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
+
+ page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS);
+ if (page_pool) {
+ page_pool[0] = page0;
+ } else {
+ page_pool = &page0;
+ max_pages = 1;
+ }
+ for (npages = 1; npages < max_pages; npages++) {
+ page = page_cache_alloc_cold(inode->i_mapping);
+ if (!page)
+ break;
+ page_pool[npages] = page;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ op_data->op_npages = npages;
+ op_data->op_offset = hash;
+ rc = md_readpage(exp, op_data, page_pool, &request);
+ ll_finish_md_op_data(op_data);
+ if (rc < 0) {
+		/* page0 is special: it was added into the page cache early */
+ delete_from_page_cache(page0);
+ } else if (rc == 0) {
+ body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+ /* Checked by mdc_readpage() */
+ LASSERT(body != NULL);
+
+ if (body->valid & OBD_MD_FLSIZE)
+ cl_isize_write(inode, body->size);
+
+ nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
+ >> PAGE_CACHE_SHIFT;
+ SetPageUptodate(page0);
+ }
+ unlock_page(page0);
+ ptlrpc_req_finished(request);
+
+ CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
+
+ ll_pagevec_init(&lru_pvec, 0);
+ for (i = 1; i < npages; i++) {
+ unsigned long offset;
+ int ret;
+
+ page = page_pool[i];
+
+ if (rc < 0 || i >= nrdpgs) {
+ page_cache_release(page);
+ continue;
+ }
+
+ SetPageUptodate(page);
+
+ dp = kmap(page);
+ hash = le64_to_cpu(dp->ldp_hash_start);
+ kunmap(page);
+
+ offset = hash_x_index(hash, hash64);
+
+ prefetchw(&page->flags);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
+ GFP_KERNEL);
+ if (ret == 0) {
+ unlock_page(page);
+ if (ll_pagevec_add(&lru_pvec, page) == 0)
+ ll_pagevec_lru_add_file(&lru_pvec);
+ } else {
+ CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n",
+ offset, ret);
+ }
+ page_cache_release(page);
+ }
+ ll_pagevec_lru_add_file(&lru_pvec);
+
+ if (page_pool != &page0)
+ OBD_FREE(page_pool, sizeof(struct page *) * max_pages);
+ return rc;
+}
+
+static void ll_check_page(struct inode *dir, struct page *page)
+{
+ /* XXX: check page format later */
+ SetPageChecked(page);
+}
+
+void ll_release_page(struct page *page, int remove)
+{
+ kunmap(page);
+ if (remove) {
+ lock_page(page);
+ if (likely(page->mapping != NULL))
+ truncate_complete_page(page->mapping, page);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+}
+
+/*
+ * Find, kmap and return page that contains given hash.
+ */
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
+ __u64 *start, __u64 *end)
+{
+ int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+ struct address_space *mapping = dir->i_mapping;
+ /*
+	 * The complement of the hash is used as an index so that
+	 * radix_tree_gang_lookup() can be used to find a page with a starting
+	 * hash _smaller_ than the one we are looking for.
+ */
+ unsigned long offset = hash_x_index(*hash, hash64);
+ struct page *page;
+ int found;
+
+ spin_lock_irq(&mapping->tree_lock);
+ found = radix_tree_gang_lookup(&mapping->page_tree,
+ (void **)&page, offset, 1);
+ if (found > 0 && !radix_tree_exceptional_entry(page)) {
+ struct lu_dirpage *dp;
+
+ page_cache_get(page);
+ spin_unlock_irq(&mapping->tree_lock);
+ /*
+ * In contrast to find_lock_page() we are sure that directory
+ * page cannot be truncated (while DLM lock is held) and,
+ * hence, can avoid restart.
+ *
+ * In fact, page cannot be locked here at all, because
+ * ll_dir_filler() does synchronous io.
+ */
+ wait_on_page_locked(page);
+ if (PageUptodate(page)) {
+ dp = kmap(page);
+ if (BITS_PER_LONG == 32 && hash64) {
+ *start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+ *end = le64_to_cpu(dp->ldp_hash_end) >> 32;
+ *hash = *hash >> 32;
+ } else {
+ *start = le64_to_cpu(dp->ldp_hash_start);
+ *end = le64_to_cpu(dp->ldp_hash_end);
+ }
+ LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n",
+ *start, *end, *hash);
+ CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash %llu\n",
+ offset, *start, *end, *hash);
+ if (*hash > *end) {
+ ll_release_page(page, 0);
+ page = NULL;
+ } else if (*end != *start && *hash == *end) {
+ /*
+ * upon hash collision, remove this page,
+ * otherwise put page reference, and
+ * ll_get_dir_page() will issue RPC to fetch
+ * the page we want.
+ */
+ ll_release_page(page,
+ le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+ page = NULL;
+ }
+ } else {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+
+ } else {
+ spin_unlock_irq(&mapping->tree_lock);
+ page = NULL;
+ }
+ return page;
+}
+
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+ struct ll_dir_chain *chain)
+{
+ ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
+ struct address_space *mapping = dir->i_mapping;
+ struct lustre_handle lockh;
+ struct lu_dirpage *dp;
+ struct page *page;
+ ldlm_mode_t mode;
+ int rc;
+ __u64 start = 0;
+ __u64 end = 0;
+ __u64 lhash = hash;
+ struct ll_inode_info *lli = ll_i2info(dir);
+ int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+
+ mode = LCK_PR;
+ rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
+ ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
+ if (!rc) {
+ struct ldlm_enqueue_info einfo = {
+ .ei_type = LDLM_IBITS,
+ .ei_mode = mode,
+ .ei_cb_bl = ll_md_blocking_ast,
+ .ei_cb_cp = ldlm_completion_ast,
+ };
+ struct lookup_intent it = { .it_op = IT_READDIR };
+ struct ptlrpc_request *request;
+ struct md_op_data *op_data;
+
+ op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return (void *)op_data;
+
+ rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it,
+ op_data, &lockh, NULL, 0, NULL, 0);
+
+ ll_finish_md_op_data(op_data);
+
+ request = (struct ptlrpc_request *)it.d.lustre.it_data;
+ if (request)
+ ptlrpc_req_finished(request);
+ if (rc < 0) {
+ CERROR("lock enqueue: "DFID" at %llu: rc %d\n",
+ PFID(ll_inode2fid(dir)), hash, rc);
+ return ERR_PTR(rc);
+ }
+
+ CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
+ dir, dir->i_ino, dir->i_generation);
+ md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
+ &it.d.lustre.it_lock_handle, dir, NULL);
+ } else {
+		/* for a cross-ref object, the l_ast_data of the lock may not
+		 * be set, so we reset it here */
+ md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie,
+ dir, NULL);
+ }
+ ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+ mutex_lock(&lli->lli_readdir_mutex);
+ page = ll_dir_page_locate(dir, &lhash, &start, &end);
+ if (IS_ERR(page)) {
+ CERROR("dir page locate: "DFID" at %llu: rc %ld\n",
+ PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
+ goto out_unlock;
+ } else if (page != NULL) {
+ /*
+ * XXX nikita: not entirely correct handling of a corner case:
+ * suppose hash chain of entries with hash value HASH crosses
+ * border between pages P0 and P1. First both P0 and P1 are
+ * cached, seekdir() is called for some entry from the P0 part
+ * of the chain. Later P0 goes out of cache. telldir(HASH)
+ * happens and finds P1, as it starts with matching hash
+ * value. Remaining entries from P0 part of the chain are
+ * skipped. (Is that really a bug?)
+ *
+ * Possible solutions: 0. don't cache P1 is such case, handle
+ * it as an "overflow" page. 1. invalidate all pages at
+ * once. 2. use HASH|1 as an index for P1.
+ */
+ goto hash_collision;
+ }
+
+ page = read_cache_page(mapping, hash_x_index(hash, hash64),
+ ll_dir_filler, &lhash);
+ if (IS_ERR(page)) {
+ CERROR("read cache page: "DFID" at %llu: rc %ld\n",
+ PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+ goto out_unlock;
+ }
+
+ wait_on_page_locked(page);
+ (void)kmap(page);
+ if (!PageUptodate(page)) {
+ CERROR("page not updated: "DFID" at %llu: rc %d\n",
+ PFID(ll_inode2fid(dir)), hash, -5);
+ goto fail;
+ }
+ if (!PageChecked(page))
+ ll_check_page(dir, page);
+ if (PageError(page)) {
+ CERROR("page error: "DFID" at %llu: rc %d\n",
+ PFID(ll_inode2fid(dir)), hash, -5);
+ goto fail;
+ }
+hash_collision:
+ dp = page_address(page);
+ if (BITS_PER_LONG == 32 && hash64) {
+ start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+ end = le64_to_cpu(dp->ldp_hash_end) >> 32;
+ lhash = hash >> 32;
+ } else {
+ start = le64_to_cpu(dp->ldp_hash_start);
+ end = le64_to_cpu(dp->ldp_hash_end);
+ lhash = hash;
+ }
+ if (end == start) {
+ LASSERT(start == lhash);
+ CWARN("Page-wide hash collision: %llu\n", end);
+ if (BITS_PER_LONG == 32 && hash64)
+ CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n",
+ le64_to_cpu(dp->ldp_hash_start),
+ le64_to_cpu(dp->ldp_hash_end), hash);
+ /*
+ * Fetch whole overflow chain...
+ *
+ * XXX not yet.
+ */
+ goto fail;
+ }
+out_unlock:
+ mutex_unlock(&lli->lli_readdir_mutex);
+ ldlm_lock_decref(&lockh, mode);
+ return page;
+
+fail:
+ ll_release_page(page, 1);
+ page = ERR_PTR(-EIO);
+ goto out_unlock;
+}
+
+int ll_dir_read(struct inode *inode, struct dir_context *ctx)
+{
+ struct ll_inode_info *info = ll_i2info(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ __u64 pos = ctx->pos;
+ int api32 = ll_need_32bit_api(sbi);
+ int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
+ struct page *page;
+ struct ll_dir_chain chain;
+ int done = 0;
+ int rc = 0;
+
+ ll_dir_chain_init(&chain);
+
+ page = ll_get_dir_page(inode, pos, &chain);
+
+ while (rc == 0 && !done) {
+ struct lu_dirpage *dp;
+ struct lu_dirent *ent;
+
+ if (!IS_ERR(page)) {
+ /*
+ * If page is empty (end of directory is reached),
+ * use this value.
+ */
+ __u64 hash = MDS_DIR_END_OFF;
+ __u64 next;
+
+ dp = page_address(page);
+ for (ent = lu_dirent_start(dp); ent != NULL && !done;
+ ent = lu_dirent_next(ent)) {
+ __u16 type;
+ int namelen;
+ struct lu_fid fid;
+ __u64 lhash;
+ __u64 ino;
+
+ /*
+ * XXX: implement correct swabbing here.
+ */
+
+ hash = le64_to_cpu(ent->lde_hash);
+ if (hash < pos)
+ /*
+ * Skip until we find target hash
+ * value.
+ */
+ continue;
+
+ namelen = le16_to_cpu(ent->lde_namelen);
+ if (namelen == 0)
+ /*
+ * Skip dummy record.
+ */
+ continue;
+
+ if (api32 && hash64)
+ lhash = hash >> 32;
+ else
+ lhash = hash;
+ fid_le_to_cpu(&fid, &ent->lde_fid);
+ ino = cl_fid_build_ino(&fid, api32);
+ type = ll_dirent_type_get(ent);
+ ctx->pos = lhash;
+				/* 'll_nfs_get_name_filldir()' will try to
+				 * access 'ent' through its 'lde_name', so the
+				 * 'name' parameter passed to 'ctx->actor()'
+				 * must be part of 'ent'.
+				 */
+ done = !dir_emit(ctx, ent->lde_name,
+ namelen, ino, type);
+ }
+ next = le64_to_cpu(dp->ldp_hash_end);
+ if (!done) {
+ pos = next;
+ if (pos == MDS_DIR_END_OFF) {
+ /*
+ * End of directory reached.
+ */
+ done = 1;
+ ll_release_page(page, 0);
+ } else if (1 /* chain is exhausted*/) {
+ /*
+ * Normal case: continue to the next
+ * page.
+ */
+ ll_release_page(page,
+ le32_to_cpu(dp->ldp_flags) &
+ LDF_COLLIDE);
+ next = pos;
+ page = ll_get_dir_page(inode, pos,
+ &chain);
+ } else {
+ /*
+ * go into overflow page.
+ */
+ LASSERT(le32_to_cpu(dp->ldp_flags) &
+ LDF_COLLIDE);
+ ll_release_page(page, 1);
+ }
+ } else {
+ pos = hash;
+ ll_release_page(page, 0);
+ }
+ } else {
+ rc = PTR_ERR(page);
+ CERROR("error reading dir "DFID" at %lu: rc %d\n",
+ PFID(&info->lli_fid), (unsigned long)pos, rc);
+ }
+ }
+
+ ctx->pos = pos;
+ ll_dir_chain_fini(&chain);
+ return rc;
+}
+
+static int ll_readdir(struct file *filp, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(filp);
+ struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
+ int api32 = ll_need_32bit_api(sbi);
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api %d\n",
+ inode->i_ino, inode->i_generation,
+ inode, (unsigned long)lfd->lfd_pos, i_size_read(inode), api32);
+
+ if (lfd->lfd_pos == MDS_DIR_END_OFF) {
+ /*
+ * end-of-file.
+ */
+ rc = 0;
+ goto out;
+ }
+
+ ctx->pos = lfd->lfd_pos;
+ rc = ll_dir_read(inode, ctx);
+ lfd->lfd_pos = ctx->pos;
+ if (ctx->pos == MDS_DIR_END_OFF) {
+ if (api32)
+ ctx->pos = LL_DIR_END_OFF_32BIT;
+ else
+ ctx->pos = LL_DIR_END_OFF;
+ } else {
+ if (api32 && hash64)
+ ctx->pos >>= 32;
+ }
+ filp->f_version = inode->i_version;
+
+out:
+ if (!rc)
+ ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
+
+ return rc;
+}
+
+static int ll_send_mgc_param(struct obd_export *mgc, char *string)
+{
+ struct mgs_send_param *msp;
+ int rc = 0;
+
+ msp = kzalloc(sizeof(*msp), GFP_NOFS);
+ if (!msp)
+ return -ENOMEM;
+
+ strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
+ rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
+ sizeof(struct mgs_send_param), msp, NULL);
+ if (rc)
+ CERROR("Failed to set parameter: %d\n", rc);
+ OBD_FREE_PTR(msp);
+
+ return rc;
+}
+
+static int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump,
+ char *filename)
+{
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ struct ll_sb_info *sbi = ll_i2sbi(dir);
+ int mode;
+ int err;
+
+ mode = (0755 & ~current_umask()) | S_IFDIR;
+ op_data = ll_prep_md_op_data(NULL, dir, NULL, filename,
+ strlen(filename), mode, LUSTRE_OPC_MKDIR,
+ lump);
+ if (IS_ERR(op_data)) {
+ err = PTR_ERR(op_data);
+ goto err_exit;
+ }
+
+ op_data->op_cli_flags |= CLI_SET_MEA;
+ err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode,
+ from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns, current_fsgid()),
+ cfs_curproc_cap_pack(), 0, &request);
+ ll_finish_md_op_data(op_data);
+ if (err)
+ goto err_exit;
+err_exit:
+ ptlrpc_req_finished(request);
+ return err;
+}
+
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+ int set_default)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct md_op_data *op_data;
+ struct ptlrpc_request *req = NULL;
+ int rc = 0;
+ struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
+ struct obd_device *mgc = lsi->lsi_mgc;
+ int lum_size;
+
+ if (lump != NULL) {
+ /*
+ * This is coming from userspace, so should be in
+ * local endian. But the MDS would like it in little
+ * endian, so we swab it before we send it.
+ */
+ switch (lump->lmm_magic) {
+ case LOV_USER_MAGIC_V1: {
+ if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+ lustre_swab_lov_user_md_v1(lump);
+ lum_size = sizeof(struct lov_user_md_v1);
+ break;
+ }
+ case LOV_USER_MAGIC_V3: {
+ if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+ lustre_swab_lov_user_md_v3(
+ (struct lov_user_md_v3 *)lump);
+ lum_size = sizeof(struct lov_user_md_v3);
+ break;
+ }
+ default: {
+ CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+ lump->lmm_magic, LOV_USER_MAGIC_V1,
+ LOV_USER_MAGIC_V3);
+ return -EINVAL;
+ }
+ }
+ } else {
+ lum_size = sizeof(struct lov_user_md_v1);
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC))
+ op_data->op_cli_flags |= CLI_SET_MEA;
+
+ /* swabbing is done in lov_setstripe() on server side */
+ rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
+ NULL, 0, &req, NULL);
+ ll_finish_md_op_data(op_data);
+ ptlrpc_req_finished(req);
+ if (rc) {
+ if (rc != -EPERM && rc != -EACCES)
+ CERROR("mdc_setattr fails: rc = %d\n", rc);
+ }
+
+	/* In the following we use the fact that LOV_USER_MAGIC_V1 and
+	 * LOV_USER_MAGIC_V3 have the same initial fields, so we do not
+	 * need to distinguish between the two versions */
+ if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
+ char *param = NULL;
+ char *buf;
+
+ param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS);
+ if (!param) {
+ rc = -ENOMEM;
+ goto end;
+ }
+
+ buf = param;
+ /* Get fsname and assume devname to be -MDT0000. */
+ ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
+ strcat(buf, "-MDT0000.lov");
+ buf += strlen(buf);
+
+ /* Set root stripesize */
+ sprintf(buf, ".stripesize=%u",
+ lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
+ rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+ if (rc)
+ goto end;
+
+ /* Set root stripecount */
+ sprintf(buf, ".stripecount=%hd",
+ lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
+ rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+ if (rc)
+ goto end;
+
+ /* Set root stripeoffset */
+ sprintf(buf, ".stripeoffset=%hd",
+ lump ? le16_to_cpu(lump->lmm_stripe_offset) :
+ (typeof(lump->lmm_stripe_offset))(-1));
+ rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+
+end:
+ if (param != NULL)
+ OBD_FREE(param, MGS_PARAM_MAXLEN);
+ }
+ return rc;
+}
+
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+ int *lmm_size, struct ptlrpc_request **request)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct mdt_body *body;
+ struct lov_mds_md *lmm = NULL;
+ struct ptlrpc_request *req = NULL;
+ int rc, lmmsize;
+ struct md_op_data *op_data;
+
+ rc = ll_get_default_mdsize(sbi, &lmmsize);
+ if (rc)
+ return rc;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+ 0, lmmsize, LUSTRE_OPC_ANY,
+ NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+ rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+ ll_finish_md_op_data(op_data);
+ if (rc < 0) {
+ CDEBUG(D_INFO, "md_getattr failed on inode %lu/%u: rc %d\n",
+ inode->i_ino,
+ inode->i_generation, rc);
+ goto out;
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+
+ lmmsize = body->eadatasize;
+
+ if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+ lmmsize == 0) {
+ rc = -ENODATA;
+ goto out;
+ }
+
+ lmm = req_capsule_server_sized_get(&req->rq_pill,
+ &RMF_MDT_MD, lmmsize);
+ LASSERT(lmm != NULL);
+
+ /*
+ * This is coming from the MDS, so is probably in
+ * little endian. We convert it to host endian before
+ * passing it to userspace.
+ */
+ /* We don't swab objects for directories */
+ switch (le32_to_cpu(lmm->lmm_magic)) {
+ case LOV_MAGIC_V1:
+ if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+ lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+ break;
+ case LOV_MAGIC_V3:
+ if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+ lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+ break;
+ default:
+ CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+ rc = -EPROTO;
+ }
+out:
+ *lmmp = lmm;
+ *lmm_size = lmmsize;
+ *request = req;
+ return rc;
+}
+
+/*
+ * Get MDT index for the inode.
+ */
+int ll_get_mdt_idx(struct inode *inode)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct md_op_data *op_data;
+ int rc, mdtidx;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0,
+ 0, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ op_data->op_flags |= MF_GET_MDT_IDX;
+ rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
+ mdtidx = op_data->op_mds;
+ ll_finish_md_op_data(op_data);
+ if (rc < 0) {
+ CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+ return rc;
+ }
+ return mdtidx;
+}
+
+/**
+ * Generic handler to do any pre-copy work.
+ *
+ * It sends a first hsm_progress (with extent length == 0) to the coordinator
+ * to inform it that the real work has started.
+ *
+ * Moreover, for an ARCHIVE request, it will sample the file data version and
+ * store it in \a copy.
+ *
+ * \return 0 on success.
+ */
+static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct hsm_progress_kernel hpk;
+ int rc;
+
+ /* Forge a hsm_progress based on data from copy. */
+ hpk.hpk_fid = copy->hc_hai.hai_fid;
+ hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+ hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
+ hpk.hpk_extent.length = 0;
+ hpk.hpk_flags = 0;
+ hpk.hpk_errval = 0;
+ hpk.hpk_data_version = 0;
+
+
+ /* For archive request, we need to read the current file version. */
+ if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
+ struct inode *inode;
+ __u64 data_version = 0;
+
+ /* Get inode for this fid */
+ inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+ if (IS_ERR(inode)) {
+ hpk.hpk_flags |= HP_FLAG_RETRY;
+ /* hpk_errval is >= 0 */
+ hpk.hpk_errval = -PTR_ERR(inode);
+ rc = PTR_ERR(inode);
+ goto progress;
+ }
+
+ /* Read current file data version */
+ rc = ll_data_version(inode, &data_version, 1);
+ iput(inode);
+ if (rc != 0) {
+ CDEBUG(D_HSM, "Could not read file data version of "
+ DFID" (rc = %d). Archive request (%#llx) could not be done.\n",
+ PFID(&copy->hc_hai.hai_fid), rc,
+ copy->hc_hai.hai_cookie);
+ hpk.hpk_flags |= HP_FLAG_RETRY;
+ /* hpk_errval must be >= 0 */
+ hpk.hpk_errval = -rc;
+ goto progress;
+ }
+
+		/* Store it in the hsm_copy for later copytool use.
+		 * Always modified even if there is no lsm. */
+ copy->hc_data_version = data_version;
+ }
+
+progress:
+ rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+ &hpk, NULL);
+
+ return rc;
+}
+
+/**
+ * Generic handler to do any post-copy work.
+ *
+ * It will send the last hsm_progress update to the coordinator to inform it
+ * that the copy is finished and whether or not it was successful.
+ *
+ * Moreover,
+ * - for ARCHIVE request, it will sample the file data version and compare it
+ * with the version saved in ll_ioc_copy_start(). If they do not match, copy
+ * will be considered as failed.
+ * - for RESTORE request, it will sample the file data version and send it to
+ * coordinator which is useful if the file was imported as 'released'.
+ *
+ * \return 0 on success.
+ */
+static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct hsm_progress_kernel hpk;
+ int rc;
+
+ /* If you modify the logic here, also check llapi_hsm_copy_end(). */
+ /* Take care: copy->hc_hai.hai_action, len, gid and data are not
+ * initialized if copy_end was called with copy == NULL.
+ */
+
+ /* Forge a hsm_progress based on data from copy. */
+ hpk.hpk_fid = copy->hc_hai.hai_fid;
+ hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+ hpk.hpk_extent = copy->hc_hai.hai_extent;
+ hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
+ hpk.hpk_errval = copy->hc_errval;
+ hpk.hpk_data_version = 0;
+
+	/* For an archive request, we need to check that the file data was not
+	 * changed.
+	 *
+	 * For a restore request, we need to send the file data version; this
+	 * is useful when the file was created using hsm_import.
+	 */
+ if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
+ (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
+ (copy->hc_errval == 0)) {
+ struct inode *inode;
+ __u64 data_version = 0;
+
+ /* Get lsm for this fid */
+ inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+ if (IS_ERR(inode)) {
+ hpk.hpk_flags |= HP_FLAG_RETRY;
+ /* hpk_errval must be >= 0 */
+ hpk.hpk_errval = -PTR_ERR(inode);
+ rc = PTR_ERR(inode);
+ goto progress;
+ }
+
+ rc = ll_data_version(inode, &data_version,
+ copy->hc_hai.hai_action == HSMA_ARCHIVE);
+ iput(inode);
+ if (rc) {
+ CDEBUG(D_HSM, "Could not read file data version. Request could not be confirmed.\n");
+ if (hpk.hpk_errval == 0)
+ hpk.hpk_errval = -rc;
+ goto progress;
+ }
+
+		/* Store it in the hsm_progress for later copytool use.
+		 * Always modified even if there is no lsm. */
+ hpk.hpk_data_version = data_version;
+
+ /* File could have been stripped during archiving, so we need
+ * to check anyway. */
+ if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
+ (copy->hc_data_version != data_version)) {
+ CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. "
+ DFID", start:%#llx current:%#llx\n",
+ PFID(&copy->hc_hai.hai_fid),
+ copy->hc_data_version, data_version);
+ /* File was changed, send error to cdt. Do not ask for
+ * retry because if a file is modified frequently,
+ * the cdt will loop on retried archive requests.
+ * The policy engine will ask for a new archive later
+			 * when the file has not been modified for some tunable
+ * time */
+ /* we do not notify caller */
+ hpk.hpk_flags &= ~HP_FLAG_RETRY;
+ /* hpk_errval must be >= 0 */
+ hpk.hpk_errval = EBUSY;
+ }
+
+ }
+
+progress:
+ rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+ &hpk, NULL);
+
+ return rc;
+}
+
+
+static int copy_and_ioctl(int cmd, struct obd_export *exp,
+ const void __user *data, size_t size)
+{
+ void *copy;
+ int rc;
+
+ copy = kzalloc(size, GFP_NOFS);
+ if (!copy)
+ return -ENOMEM;
+
+ if (copy_from_user(copy, data, size)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ rc = obd_iocontrol(cmd, exp, size, copy, NULL);
+out:
+ OBD_FREE(copy, size);
+
+ return rc;
+}
+
+static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
+{
+ int cmd = qctl->qc_cmd;
+ int type = qctl->qc_type;
+ int id = qctl->qc_id;
+ int valid = qctl->qc_valid;
+ int rc = 0;
+
+ switch (cmd) {
+ case LUSTRE_Q_INVALIDATE:
+ case LUSTRE_Q_FINVALIDATE:
+ case Q_QUOTAON:
+ case Q_QUOTAOFF:
+ case Q_SETQUOTA:
+ case Q_SETINFO:
+ if (!capable(CFS_CAP_SYS_ADMIN) ||
+ sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ return -EPERM;
+ break;
+ case Q_GETQUOTA:
+ if (((type == USRQUOTA &&
+ !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) ||
+ (type == GRPQUOTA &&
+ !in_egroup_p(make_kgid(&init_user_ns, id)))) &&
+ (!capable(CFS_CAP_SYS_ADMIN) ||
+ sbi->ll_flags & LL_SBI_RMT_CLIENT))
+ return -EPERM;
+ break;
+ case Q_GETINFO:
+ break;
+ default:
+ CERROR("unsupported quotactl op: %#x\n", cmd);
+ return -ENOTTY;
+ }
+
+ if (valid != QC_GENERAL) {
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ return -EOPNOTSUPP;
+
+ if (cmd == Q_GETINFO)
+ qctl->qc_cmd = Q_GETOINFO;
+ else if (cmd == Q_GETQUOTA)
+ qctl->qc_cmd = Q_GETOQUOTA;
+ else
+ return -EINVAL;
+
+ switch (valid) {
+ case QC_MDTIDX:
+ rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+ sizeof(*qctl), qctl, NULL);
+ break;
+ case QC_OSTIDX:
+ rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
+ sizeof(*qctl), qctl, NULL);
+ break;
+ case QC_UUID:
+ rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+ sizeof(*qctl), qctl, NULL);
+ if (rc == -EAGAIN)
+ rc = obd_iocontrol(OBD_IOC_QUOTACTL,
+ sbi->ll_dt_exp,
+ sizeof(*qctl), qctl, NULL);
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc)
+ return rc;
+
+ qctl->qc_cmd = cmd;
+ } else {
+ struct obd_quotactl *oqctl;
+
+ oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
+ if (!oqctl)
+ return -ENOMEM;
+
+ QCTL_COPY(oqctl, qctl);
+ rc = obd_quotactl(sbi->ll_md_exp, oqctl);
+ if (rc) {
+ if (rc != -EALREADY && cmd == Q_QUOTAON) {
+ oqctl->qc_cmd = Q_QUOTAOFF;
+ obd_quotactl(sbi->ll_md_exp, oqctl);
+ }
+ OBD_FREE_PTR(oqctl);
+ return rc;
+ }
+		/* If QIF_SPACE is not set, the client should collect the
+		 * space usage from the OSSs by itself */
+ if (cmd == Q_GETQUOTA &&
+ !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
+ !oqctl->qc_dqblk.dqb_curspace) {
+ struct obd_quotactl *oqctl_tmp;
+
+ oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS);
+ if (!oqctl_tmp) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ oqctl_tmp->qc_cmd = Q_GETOQUOTA;
+ oqctl_tmp->qc_id = oqctl->qc_id;
+ oqctl_tmp->qc_type = oqctl->qc_type;
+
+ /* collect space usage from OSTs */
+ oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+ rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
+ if (!rc || rc == -EREMOTEIO) {
+ oqctl->qc_dqblk.dqb_curspace =
+ oqctl_tmp->qc_dqblk.dqb_curspace;
+ oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
+ }
+
+ /* collect space & inode usage from MDTs */
+ oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+ oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
+ rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
+ if (!rc || rc == -EREMOTEIO) {
+ oqctl->qc_dqblk.dqb_curspace +=
+ oqctl_tmp->qc_dqblk.dqb_curspace;
+ oqctl->qc_dqblk.dqb_curinodes =
+ oqctl_tmp->qc_dqblk.dqb_curinodes;
+ oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
+ } else {
+ oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
+ }
+
+ OBD_FREE_PTR(oqctl_tmp);
+ }
+out:
+ QCTL_COPY(qctl, oqctl);
+ OBD_FREE_PTR(oqctl);
+ }
+
+ return rc;
+}
+
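+/* Copy a file name from user space into a __getname() buffer; returns the
+ * buffer or an ERR_PTR() on failure. The result is freed with ll_putname(). */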
+static char *
+ll_getname(const char __user *filename)
+{
+ int ret = 0, len;
+ char *tmp = __getname();
+
+ if (!tmp)
+ return ERR_PTR(-ENOMEM);
+
+	len = strncpy_from_user(tmp, filename, PATH_MAX);
+	if (len < 0)
+		ret = len;
+	else if (len == 0)
+		ret = -ENOENT;
+	else if (len == PATH_MAX)
+		ret = -ENAMETOOLONG;
+
+ if (ret) {
+ __putname(tmp);
+ tmp = ERR_PTR(ret);
+ }
+ return tmp;
+}
+
+#define ll_putname(filename) __putname(filename)
+
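+/* Main ioctl dispatcher for directories: striping, quota, HSM, changelog and
+ * FID requests are handled here or via the MDT export; any unrecognized
+ * command is passed through to the data (OST) export. */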
+static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct obd_ioctl_data *data;
+ int rc = 0;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
+ inode->i_ino, inode->i_generation, inode, cmd);
+
+ /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
+ if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+ return -ENOTTY;
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+ switch (cmd) {
+ case FSFILT_IOC_GETFLAGS:
+ case FSFILT_IOC_SETFLAGS:
+ return ll_iocontrol(inode, file, cmd, arg);
+ case FSFILT_IOC_GETVERSION_OLD:
+ case FSFILT_IOC_GETVERSION:
+ return put_user(inode->i_generation, (int *)arg);
+ /* We need to special case any other ioctls we want to handle,
+ * to send them to the MDS/OST as appropriate and to properly
+ * network encode the arg field.
+ case FSFILT_IOC_SETVERSION_OLD:
+ case FSFILT_IOC_SETVERSION:
+ */
+ case LL_IOC_GET_MDTIDX: {
+ int mdtidx;
+
+ mdtidx = ll_get_mdt_idx(inode);
+ if (mdtidx < 0)
+ return mdtidx;
+
+ if (put_user((int)mdtidx, (int *)arg))
+ return -EFAULT;
+
+ return 0;
+ }
+ case IOC_MDC_LOOKUP: {
+ struct ptlrpc_request *request = NULL;
+ int namelen, len = 0;
+ char *buf = NULL;
+ char *filename;
+ struct md_op_data *op_data;
+
+ rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+ if (rc)
+ return rc;
+ data = (void *)buf;
+
+ filename = data->ioc_inlbuf1;
+ namelen = strlen(filename);
+
+ if (namelen < 1) {
+ CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+ rc = -EINVAL;
+ goto out_free;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen,
+ 0, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data)) {
+ rc = PTR_ERR(op_data);
+ goto out_free;
+ }
+
+ op_data->op_valid = OBD_MD_FLID;
+ rc = md_getattr_name(sbi->ll_md_exp, op_data, &request);
+ ll_finish_md_op_data(op_data);
+ if (rc < 0) {
+ CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+ goto out_free;
+ }
+ ptlrpc_req_finished(request);
+out_free:
+ obd_ioctl_freedata(buf, len);
+ return rc;
+ }
+ case LL_IOC_LMV_SETSTRIPE: {
+ struct lmv_user_md *lum;
+ char *buf = NULL;
+ char *filename;
+ int namelen = 0;
+ int lumlen = 0;
+ int len;
+ int rc;
+
+ rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+ if (rc)
+ return rc;
+
+ data = (void *)buf;
+ if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+ data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) {
+ rc = -EINVAL;
+ goto lmv_out_free;
+ }
+
+ filename = data->ioc_inlbuf1;
+ namelen = data->ioc_inllen1;
+
+ if (namelen < 1) {
+			CDEBUG(D_INFO, "LL_IOC_LMV_SETSTRIPE missing filename\n");
+ rc = -EINVAL;
+ goto lmv_out_free;
+ }
+ lum = (struct lmv_user_md *)data->ioc_inlbuf2;
+ lumlen = data->ioc_inllen2;
+
+ if (lum->lum_magic != LMV_USER_MAGIC ||
+ lumlen != sizeof(*lum)) {
+			CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
+			       filename, lum->lum_magic, lumlen, -EINVAL);
+ rc = -EINVAL;
+ goto lmv_out_free;
+ }
+
+		/*
+		 * ll_dir_setdirstripe() is used to set the directory stripe:
+		 * mdc_create() ---> mdt_reint_create() (with dirstripe)
+		 */
+ rc = ll_dir_setdirstripe(inode, lum, filename);
+lmv_out_free:
+ obd_ioctl_freedata(buf, len);
+ return rc;
+
+ }
+ case LL_IOC_LOV_SETSTRIPE: {
+ struct lov_user_md_v3 lumv3;
+ struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+ struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+ struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
+ int set_default = 0;
+
+ LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+ LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+ sizeof(lumv3p->lmm_objects[0]));
+ /* first try with v1 which is smaller than v3 */
+ if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
+ return -EFAULT;
+
+ if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+ if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
+ return -EFAULT;
+ }
+
+ if (is_root_inode(inode))
+ set_default = 1;
+
+ /* in v1 and v3 cases lumv1 points to data */
+ rc = ll_dir_setstripe(inode, lumv1, set_default);
+
+ return rc;
+ }
+ case LL_IOC_LMV_GETSTRIPE: {
+ struct lmv_user_md *lump = (struct lmv_user_md *)arg;
+ struct lmv_user_md lum;
+ struct lmv_user_md *tmp;
+ int lum_size;
+ int rc = 0;
+ int mdtindex;
+
+ if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md)))
+ return -EFAULT;
+
+ if (lum.lum_magic != LMV_MAGIC_V1)
+ return -EINVAL;
+
+ lum_size = lmv_user_md_size(1, LMV_MAGIC_V1);
+ tmp = kzalloc(lum_size, GFP_NOFS);
+ if (!tmp) {
+ rc = -ENOMEM;
+ goto free_lmv;
+ }
+
+ *tmp = lum;
+ tmp->lum_type = LMV_STRIPE_TYPE;
+ tmp->lum_stripe_count = 1;
+ mdtindex = ll_get_mdt_idx(inode);
+ if (mdtindex < 0) {
+			rc = mdtindex;
+ goto free_lmv;
+ }
+
+ tmp->lum_stripe_offset = mdtindex;
+ tmp->lum_objects[0].lum_mds = mdtindex;
+ memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode),
+ sizeof(struct lu_fid));
+ if (copy_to_user((void *)arg, tmp, lum_size)) {
+ rc = -EFAULT;
+ goto free_lmv;
+ }
+free_lmv:
+ if (tmp)
+ OBD_FREE(tmp, lum_size);
+ return rc;
+ }
+ case LL_IOC_REMOVE_ENTRY: {
+ char *filename = NULL;
+ int namelen = 0;
+ int rc;
+
+		/* Here is a little hack to avoid sending REINT_RMENTRY to an
+		 * unsupported server, which might crash the server (LU-2730).
+		 * Because both LVB_TYPE and REINT_RMENTRY are supported from
+		 * 2.4 on, we use OBD_CONNECT_LVB_TYPE to detect whether the
+		 * server supports REINT_RMENTRY. XXX */
+ if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
+ return -ENOTSUPP;
+
+ filename = ll_getname((const char *)arg);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
+
+ namelen = strlen(filename);
+ if (namelen < 1) {
+ rc = -EINVAL;
+ goto out_rmdir;
+ }
+
+ rc = ll_rmdir_entry(inode, filename, namelen);
+out_rmdir:
+ if (filename)
+ ll_putname(filename);
+ return rc;
+ }
+ case LL_IOC_LOV_SWAP_LAYOUTS:
+ return -EPERM;
+ case LL_IOC_OBD_STATFS:
+ return ll_obd_statfs(inode, (void *)arg);
+ case LL_IOC_LOV_GETSTRIPE:
+ case LL_IOC_MDC_GETINFO:
+ case IOC_MDC_GETFILEINFO:
+ case IOC_MDC_GETFILESTRIPE: {
+ struct ptlrpc_request *request = NULL;
+ struct lov_user_md *lump;
+ struct lov_mds_md *lmm = NULL;
+ struct mdt_body *body;
+ char *filename = NULL;
+ int lmmsize;
+
+ if (cmd == IOC_MDC_GETFILEINFO ||
+ cmd == IOC_MDC_GETFILESTRIPE) {
+ filename = ll_getname((const char *)arg);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
+
+ rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
+ &lmmsize, &request);
+ } else {
+ rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+ }
+
+ if (request) {
+ body = req_capsule_server_get(&request->rq_pill,
+ &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+ } else {
+ goto out_req;
+ }
+
+ if (rc < 0) {
+ if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
+ cmd == LL_IOC_MDC_GETINFO)) {
+ rc = 0;
+ goto skip_lmm;
+ } else
+ goto out_req;
+ }
+
+ if (cmd == IOC_MDC_GETFILESTRIPE ||
+ cmd == LL_IOC_LOV_GETSTRIPE) {
+ lump = (struct lov_user_md *)arg;
+ } else {
+ struct lov_user_mds_data *lmdp;
+
+ lmdp = (struct lov_user_mds_data *)arg;
+ lump = &lmdp->lmd_lmm;
+ }
+ if (copy_to_user(lump, lmm, lmmsize)) {
+ if (copy_to_user(lump, lmm, sizeof(*lump))) {
+ rc = -EFAULT;
+ goto out_req;
+ }
+ rc = -EOVERFLOW;
+ }
+skip_lmm:
+ if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
+ struct lov_user_mds_data *lmdp;
+ lstat_t st = { 0 };
+
+ st.st_dev = inode->i_sb->s_dev;
+ st.st_mode = body->mode;
+ st.st_nlink = body->nlink;
+ st.st_uid = body->uid;
+ st.st_gid = body->gid;
+ st.st_rdev = body->rdev;
+ st.st_size = body->size;
+ st.st_blksize = PAGE_CACHE_SIZE;
+ st.st_blocks = body->blocks;
+ st.st_atime = body->atime;
+ st.st_mtime = body->mtime;
+ st.st_ctime = body->ctime;
+ st.st_ino = inode->i_ino;
+
+ lmdp = (struct lov_user_mds_data *)arg;
+ if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) {
+ rc = -EFAULT;
+ goto out_req;
+ }
+ }
+
+out_req:
+ ptlrpc_req_finished(request);
+ if (filename)
+ ll_putname(filename);
+ return rc;
+ }
+ case IOC_LOV_GETINFO: {
+ struct lov_user_mds_data *lumd;
+ struct lov_stripe_md *lsm;
+ struct lov_user_md *lum;
+ struct lov_mds_md *lmm;
+ int lmmsize;
+ lstat_t st;
+
+ lumd = (struct lov_user_mds_data *)arg;
+ lum = &lumd->lmd_lmm;
+
+ rc = ll_get_max_mdsize(sbi, &lmmsize);
+ if (rc)
+ return rc;
+
+ OBD_ALLOC_LARGE(lmm, lmmsize);
+ if (lmm == NULL)
+ return -ENOMEM;
+ if (copy_from_user(lmm, lum, lmmsize)) {
+ rc = -EFAULT;
+ goto free_lmm;
+ }
+
+ switch (lmm->lmm_magic) {
+ case LOV_USER_MAGIC_V1:
+ if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
+ break;
+			/* swab objects first so that the stripe count will be sane */
+ lustre_swab_lov_user_md_objects(
+ ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+ ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+ lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+ break;
+ case LOV_USER_MAGIC_V3:
+ if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
+ break;
+			/* swab objects first so that the stripe count will be sane */
+ lustre_swab_lov_user_md_objects(
+ ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+ ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+ lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+ break;
+ default:
+ rc = -EINVAL;
+ goto free_lmm;
+ }
+
+ rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
+ if (rc < 0) {
+ rc = -ENOMEM;
+ goto free_lmm;
+ }
+
+ /* Perform glimpse_size operation. */
+ memset(&st, 0, sizeof(st));
+
+ rc = ll_glimpse_ioctl(sbi, lsm, &st);
+ if (rc)
+ goto free_lsm;
+
+ if (copy_to_user(&lumd->lmd_st, &st, sizeof(st))) {
+ rc = -EFAULT;
+ goto free_lsm;
+ }
+
+free_lsm:
+ obd_free_memmd(sbi->ll_dt_exp, &lsm);
+free_lmm:
+ OBD_FREE_LARGE(lmm, lmmsize);
+ return rc;
+ }
+ case OBD_IOC_LLOG_CATINFO: {
+ return -EOPNOTSUPP;
+ }
+ case OBD_IOC_QUOTACHECK: {
+ struct obd_quotactl *oqctl;
+ int error = 0;
+
+ if (!capable(CFS_CAP_SYS_ADMIN) ||
+ sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ return -EPERM;
+
+ oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
+ if (!oqctl)
+ return -ENOMEM;
+ oqctl->qc_type = arg;
+ rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
+ if (rc < 0) {
+ CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
+ error = rc;
+ }
+
+ rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
+ if (rc < 0)
+ CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
+
+ OBD_FREE_PTR(oqctl);
+ return error ?: rc;
+ }
+ case OBD_IOC_POLL_QUOTACHECK: {
+ struct if_quotacheck *check;
+
+ if (!capable(CFS_CAP_SYS_ADMIN) ||
+ sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ return -EPERM;
+
+ check = kzalloc(sizeof(*check), GFP_NOFS);
+ if (!check)
+ return -ENOMEM;
+
+ rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
+ NULL);
+ if (rc) {
+ CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
+ if (copy_to_user((void *)arg, check,
+ sizeof(*check)))
+ CDEBUG(D_QUOTA, "copy_to_user failed\n");
+ goto out_poll;
+ }
+
+ rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
+ NULL);
+ if (rc) {
+ CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
+ if (copy_to_user((void *)arg, check,
+ sizeof(*check)))
+ CDEBUG(D_QUOTA, "copy_to_user failed\n");
+ goto out_poll;
+ }
+out_poll:
+ OBD_FREE_PTR(check);
+ return rc;
+ }
+ case LL_IOC_QUOTACTL: {
+ struct if_quotactl *qctl;
+
+ qctl = kzalloc(sizeof(*qctl), GFP_NOFS);
+ if (!qctl)
+ return -ENOMEM;
+
+ if (copy_from_user(qctl, (void *)arg, sizeof(*qctl))) {
+ rc = -EFAULT;
+ goto out_quotactl;
+ }
+
+ rc = quotactl_ioctl(sbi, qctl);
+
+ if (rc == 0 && copy_to_user((void *)arg, qctl, sizeof(*qctl)))
+ rc = -EFAULT;
+
+out_quotactl:
+ OBD_FREE_PTR(qctl);
+ return rc;
+ }
+ case OBD_IOC_GETDTNAME:
+ case OBD_IOC_GETMDNAME:
+ return ll_get_obd_name(inode, cmd, arg);
+ case LL_IOC_FLUSHCTX:
+ return ll_flush_ctx(inode);
+#ifdef CONFIG_FS_POSIX_ACL
+ case LL_IOC_RMTACL: {
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+ LASSERT(fd != NULL);
+ rc = rct_add(&sbi->ll_rct, current_pid(), arg);
+ if (!rc)
+ fd->fd_flags |= LL_FILE_RMTACL;
+ return rc;
+ } else
+ return 0;
+ }
+#endif
+ case LL_IOC_GETOBDCOUNT: {
+ int count, vallen;
+ struct obd_export *exp;
+
+ if (copy_from_user(&count, (int *)arg, sizeof(int)))
+ return -EFAULT;
+
+ /* get ost count when count is zero, get mdt count otherwise */
+ exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
+ vallen = sizeof(count);
+ rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
+ KEY_TGT_COUNT, &vallen, &count, NULL);
+ if (rc) {
+ CERROR("get target count failed: %d\n", rc);
+ return rc;
+ }
+
+ if (copy_to_user((int *)arg, &count, sizeof(int)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case LL_IOC_PATH2FID:
+ if (copy_to_user((void *)arg, ll_inode2fid(inode),
+ sizeof(struct lu_fid)))
+ return -EFAULT;
+ return 0;
+ case LL_IOC_GET_CONNECT_FLAGS: {
+ return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void *)arg);
+ }
+ case OBD_IOC_CHANGELOG_SEND:
+ case OBD_IOC_CHANGELOG_CLEAR:
+ rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+ sizeof(struct ioc_changelog));
+ return rc;
+ case OBD_IOC_FID2PATH:
+ return ll_fid2path(inode, (void *)arg);
+ case LL_IOC_HSM_REQUEST: {
+ struct hsm_user_request *hur;
+ ssize_t totalsize;
+
+ hur = kzalloc(sizeof(*hur), GFP_NOFS);
+ if (!hur)
+ return -ENOMEM;
+
+ /* We don't know the true size yet; copy the fixed-size part */
+ if (copy_from_user(hur, (void *)arg, sizeof(*hur))) {
+ OBD_FREE_PTR(hur);
+ return -EFAULT;
+ }
+
+ /* Compute the whole struct size */
+ totalsize = hur_len(hur);
+ OBD_FREE_PTR(hur);
+ if (totalsize < 0)
+ return -E2BIG;
+
+ /* Final size will be more than double totalsize */
+ if (totalsize >= MDS_MAXREQSIZE / 3)
+ return -E2BIG;
+
+ OBD_ALLOC_LARGE(hur, totalsize);
+ if (hur == NULL)
+ return -ENOMEM;
+
+ /* Copy the whole struct */
+ if (copy_from_user(hur, (void *)arg, totalsize)) {
+ OBD_FREE_LARGE(hur, totalsize);
+ return -EFAULT;
+ }
+
+ if (hur->hur_request.hr_action == HUA_RELEASE) {
+ const struct lu_fid *fid;
+ struct inode *f;
+ int i;
+
+ for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
+ fid = &hur->hur_user_item[i].hui_fid;
+ f = search_inode_for_lustre(inode->i_sb, fid);
+ if (IS_ERR(f)) {
+ rc = PTR_ERR(f);
+ break;
+ }
+
+ rc = ll_hsm_release(f);
+ iput(f);
+ if (rc != 0)
+ break;
+ }
+ } else {
+ rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
+ hur, NULL);
+ }
+
+ OBD_FREE_LARGE(hur, totalsize);
+
+ return rc;
+ }
+ case LL_IOC_HSM_PROGRESS: {
+ struct hsm_progress_kernel hpk;
+ struct hsm_progress hp;
+
+ if (copy_from_user(&hp, (void *)arg, sizeof(hp)))
+ return -EFAULT;
+
+ hpk.hpk_fid = hp.hp_fid;
+ hpk.hpk_cookie = hp.hp_cookie;
+ hpk.hpk_extent = hp.hp_extent;
+ hpk.hpk_flags = hp.hp_flags;
+ hpk.hpk_errval = hp.hp_errval;
+ hpk.hpk_data_version = 0;
+
+ /* File may not exist in Lustre; all progress
+ * reported to Lustre root */
+ rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
+ NULL);
+ return rc;
+ }
+ case LL_IOC_HSM_CT_START:
+ rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+ sizeof(struct lustre_kernelcomm));
+ return rc;
+
+ case LL_IOC_HSM_COPY_START: {
+ struct hsm_copy *copy;
+ int rc;
+
+ copy = kzalloc(sizeof(*copy), GFP_NOFS);
+ if (!copy)
+ return -ENOMEM;
+ if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+ OBD_FREE_PTR(copy);
+ return -EFAULT;
+ }
+
+ rc = ll_ioc_copy_start(inode->i_sb, copy);
+ if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+ rc = -EFAULT;
+
+ OBD_FREE_PTR(copy);
+ return rc;
+ }
+ case LL_IOC_HSM_COPY_END: {
+ struct hsm_copy *copy;
+ int rc;
+
+ copy = kzalloc(sizeof(*copy), GFP_NOFS);
+ if (!copy)
+ return -ENOMEM;
+ if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+ OBD_FREE_PTR(copy);
+ return -EFAULT;
+ }
+
+ rc = ll_ioc_copy_end(inode->i_sb, copy);
+ if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+ rc = -EFAULT;
+
+ OBD_FREE_PTR(copy);
+ return rc;
+ }
+ default:
+ return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, (void *)arg);
+ }
+}
+
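+/* llseek for directories: the offset is bounded by the 32-bit or 64-bit
+ * directory end marker, and the new hash position is recorded in
+ * fd->lfd_pos (shifted for 32-bit clients on a 64-bit-hash filesystem). */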
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int api32 = ll_need_32bit_api(sbi);
+ loff_t ret = -EINVAL;
+
+ mutex_lock(&inode->i_mutex);
+ switch (origin) {
+ case SEEK_SET:
+ break;
+ case SEEK_CUR:
+ offset += file->f_pos;
+ break;
+ case SEEK_END:
+ if (offset > 0)
+ goto out;
+ if (api32)
+ offset += LL_DIR_END_OFF_32BIT;
+ else
+ offset += LL_DIR_END_OFF;
+ break;
+ default:
+ goto out;
+ }
+
+ if (offset >= 0 &&
+ ((api32 && offset <= LL_DIR_END_OFF_32BIT) ||
+ (!api32 && offset <= LL_DIR_END_OFF))) {
+ if (offset != file->f_pos) {
+ if ((api32 && offset == LL_DIR_END_OFF_32BIT) ||
+ (!api32 && offset == LL_DIR_END_OFF))
+ fd->lfd_pos = MDS_DIR_END_OFF;
+ else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
+ fd->lfd_pos = offset << 32;
+ else
+ fd->lfd_pos = offset;
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+ ret = offset;
+ }
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+static int ll_dir_open(struct inode *inode, struct file *file)
+{
+ return ll_file_open(inode, file);
+}
+
+static int ll_dir_release(struct inode *inode, struct file *file)
+{
+ return ll_file_release(inode, file);
+}
+
+const struct file_operations ll_dir_operations = {
+ .llseek = ll_dir_seek,
+ .open = ll_dir_open,
+ .release = ll_dir_release,
+ .read = generic_read_dir,
+ .iterate = ll_readdir,
+ .unlocked_ioctl = ll_dir_ioctl,
+ .fsync = ll_fsync,
+};
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
new file mode 100644
index 000000000..4b44c634f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -0,0 +1,3624 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_lite.h"
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include "llite_internal.h"
+#include "../include/lustre/ll_fiemap.h"
+
+#include "../include/cl_object.h"
+
+static int
+ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+
+static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
+ bool *lease_broken);
+
+static enum llioc_iter
+ll_iocontrol_call(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg, int *rcp);
+
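+/* Allocate per-open file data from the ll_file_data_slab cache; released
+ * again through ll_file_data_put(). */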
+static struct ll_file_data *ll_file_data_get(void)
+{
+ struct ll_file_data *fd;
+
+ OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
+ if (fd == NULL)
+ return NULL;
+ fd->fd_write_failed = false;
+ return fd;
+}
+
+static void ll_file_data_put(struct ll_file_data *fd)
+{
+ if (fd != NULL)
+ OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+}
+
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+ struct lustre_handle *fh)
+{
+ op_data->op_fid1 = ll_i2info(inode)->lli_fid;
+ op_data->op_attr.ia_mode = inode->i_mode;
+ op_data->op_attr.ia_atime = inode->i_atime;
+ op_data->op_attr.ia_mtime = inode->i_mtime;
+ op_data->op_attr.ia_ctime = inode->i_ctime;
+ op_data->op_attr.ia_size = i_size_read(inode);
+ op_data->op_attr_blocks = inode->i_blocks;
+ ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+ ll_inode_to_ext_flags(inode->i_flags);
+ op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
+ if (fh)
+ op_data->op_handle = *fh;
+ op_data->op_capa1 = ll_mdscapa_get(inode);
+
+ if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
+ op_data->op_bias |= MDS_DATA_MODIFIED;
+}
+
+/**
+ * Closes the IO epoch and packs all the attributes into @op_data for
+ * the CLOSE rpc.
+ */
+static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
+ struct obd_client_handle *och)
+{
+ op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
+ ATTR_MTIME | ATTR_MTIME_SET |
+ ATTR_CTIME | ATTR_CTIME_SET;
+
+ if (!(och->och_flags & FMODE_WRITE))
+ goto out;
+
+ if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
+ op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+ else
+ ll_ioepoch_close(inode, op_data, &och, 0);
+
+out:
+ ll_pack_inode2opdata(inode, op_data, &och->och_fh);
+ ll_prep_md_op_data(op_data, inode, NULL, NULL,
+ 0, 0, LUSTRE_OPC_ANY, NULL);
+}
+
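+/* Send an MDS close for the given open handle. A non-NULL @data_version
+ * turns the close into an HSM release (MDS_HSM_RELEASE). On -EAGAIN the
+ * Size-on-MDS attributes are refreshed from the OSTs; the handle is freed
+ * unless a DONE_WRITING is still pending. */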
+static int ll_close_inode_openhandle(struct obd_export *md_exp,
+ struct inode *inode,
+ struct obd_client_handle *och,
+ const __u64 *data_version)
+{
+ struct obd_export *exp = ll_i2mdexp(inode);
+ struct md_op_data *op_data;
+ struct ptlrpc_request *req = NULL;
+ struct obd_device *obd = class_exp2obd(exp);
+ int epoch_close = 1;
+ int rc;
+
+ if (obd == NULL) {
+ /*
+ * XXX: in case of LMV, is this correct to access
+ * ->exp_handle?
+ */
+ CERROR("Invalid MDC connection handle %#llx\n",
+ ll_i2mdexp(inode)->exp_handle.h_cookie);
+ rc = 0;
+ goto out;
+ }
+
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data) {
+ /* XXX We leak openhandle and request here. */
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ ll_prepare_close(inode, op_data, och);
+ if (data_version != NULL) {
+ /* Pass in data_version implies release. */
+ op_data->op_bias |= MDS_HSM_RELEASE;
+ op_data->op_data_version = *data_version;
+ op_data->op_lease_handle = och->och_lease_handle;
+ op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+ }
+ epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
+ rc = md_close(md_exp, op_data, och->och_mod, &req);
+ if (rc == -EAGAIN) {
+ /* This close must have the epoch closed. */
+ LASSERT(epoch_close);
+		/* MDS has instructed us to obtain the Size-on-MDS attribute
+		 * from the OSTs and send a setattr back to the MDS. */
+ rc = ll_som_update(inode, op_data);
+ if (rc) {
+ CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
+ inode->i_ino, rc);
+ rc = 0;
+ }
+ } else if (rc) {
+ CERROR("inode %lu mdc close failed: rc = %d\n",
+ inode->i_ino, rc);
+ }
+
+	/* The DATA_MODIFIED flag was successfully sent on close; clear the
+	 * data modification flag. */
+ if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+ spin_unlock(&lli->lli_lock);
+ }
+
+ if (rc == 0) {
+ rc = ll_objects_destroy(req, inode);
+ if (rc)
+ CERROR("inode %lu ll_objects destroy: rc = %d\n",
+ inode->i_ino, rc);
+ }
+ if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
+ struct mdt_body *body;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (!(body->valid & OBD_MD_FLRELEASED))
+ rc = -EBUSY;
+ }
+
+ ll_finish_md_op_data(op_data);
+
+out:
+ if (exp_connect_som(exp) && !epoch_close &&
+ S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
+ ll_queue_done_writing(inode, LLIF_DONE_WRITING);
+ } else {
+ md_clear_open_replay_data(md_exp, och);
+ /* Free @och if it is not waiting for DONE_WRITING. */
+ och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+ OBD_FREE_PTR(och);
+ }
+ if (req) /* This is close request */
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
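+/* Drop the MDS open handle that matches @fmode once no local users of that
+ * open mode remain. */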
+int ll_md_real_close(struct inode *inode, fmode_t fmode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_client_handle **och_p;
+ struct obd_client_handle *och;
+ __u64 *och_usecount;
+ int rc = 0;
+
+ if (fmode & FMODE_WRITE) {
+ och_p = &lli->lli_mds_write_och;
+ och_usecount = &lli->lli_open_fd_write_count;
+ } else if (fmode & FMODE_EXEC) {
+ och_p = &lli->lli_mds_exec_och;
+ och_usecount = &lli->lli_open_fd_exec_count;
+ } else {
+ LASSERT(fmode & FMODE_READ);
+ och_p = &lli->lli_mds_read_och;
+ och_usecount = &lli->lli_open_fd_read_count;
+ }
+
+ mutex_lock(&lli->lli_och_mutex);
+ if (*och_usecount > 0) {
+ /* There are still users of this handle, so skip
+ * freeing it. */
+ mutex_unlock(&lli->lli_och_mutex);
+ return 0;
+ }
+
+ och = *och_p;
+ *och_p = NULL;
+ mutex_unlock(&lli->lli_och_mutex);
+
+ if (och != NULL) {
+ /* There might be a race and this handle may already
+ be closed. */
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+ inode, och, NULL);
+ }
+
+ return rc;
+}
+
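+/* Per-file-descriptor close: drop group lock and lease if held, decrement
+ * the open count for this mode, and only talk to the MDS when no matching
+ * cached OPEN lock remains. */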
+static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+ struct file *file)
+{
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int lockmode;
+ __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+ struct lustre_handle lockh;
+ ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}};
+ int rc = 0;
+
+ /* clear group lock, if present */
+ if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+ ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
+
+ if (fd->fd_lease_och != NULL) {
+ bool lease_broken;
+
+		/* Usually the lease is not released when the
+		 * application crashes, so we need to release it here. */
+ rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
+ CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
+ PFID(&lli->lli_fid), rc, lease_broken);
+
+ fd->fd_lease_och = NULL;
+ }
+
+ if (fd->fd_och != NULL) {
+ rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
+ fd->fd_och = NULL;
+ goto out;
+ }
+
+	/* Let's see if we have a good enough OPEN lock on the file and
+	 * can skip talking to the MDS */
+
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_omode & FMODE_WRITE) {
+ lockmode = LCK_CW;
+ LASSERT(lli->lli_open_fd_write_count);
+ lli->lli_open_fd_write_count--;
+ } else if (fd->fd_omode & FMODE_EXEC) {
+ lockmode = LCK_PR;
+ LASSERT(lli->lli_open_fd_exec_count);
+ lli->lli_open_fd_exec_count--;
+ } else {
+ lockmode = LCK_CR;
+ LASSERT(lli->lli_open_fd_read_count);
+ lli->lli_open_fd_read_count--;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+
+ if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
+ LDLM_IBITS, &policy, lockmode, &lockh))
+ rc = ll_md_real_close(inode, fd->fd_omode);
+
+out:
+ LUSTRE_FPRIVATE(file) = NULL;
+ ll_file_data_put(fd);
+ ll_capa_close(inode);
+
+ return rc;
+}
+
+/* While this returns an error code, the caller (fput()) does not check it,
+ * so we need to make every effort to clean up all of our state here. Also,
+ * applications rarely check close errors, and even if an error is returned
+ * they will not retry the close call.
+ */
+int ll_file_release(struct inode *inode, struct file *file)
+{
+ struct ll_file_data *fd;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+ inode->i_generation, inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+ LASSERT(fd != NULL);
+ if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
+ fd->fd_flags &= ~LL_FILE_RMTACL;
+ rct_del(&sbi->ll_rct, current_pid());
+ et_search_free(&sbi->ll_et, current_pid());
+ }
+ }
+#endif
+
+ if (!is_root_inode(inode))
+ ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
+ fd = LUSTRE_FPRIVATE(file);
+ LASSERT(fd != NULL);
+
+	/* This may be the last ref on @file, but not necessarily held by the
+	 * pid that owns the statahead. Different processes can open the same
+	 * dir; "ll_opendir_key" identifies the opener that should stop the
+	 * statahead thread. */
+ if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
+ lli->lli_opendir_pid != 0)
+ ll_stop_statahead(inode, lli->lli_opendir_key);
+
+ if (is_root_inode(inode)) {
+ LUSTRE_FPRIVATE(file) = NULL;
+ ll_file_data_put(fd);
+ return 0;
+ }
+
+ if (!S_ISDIR(inode->i_mode)) {
+ lov_read_and_clear_async_rc(lli->lli_clob);
+ lli->lli_async_rc = 0;
+ }
+
+ rc = ll_md_close(sbi->ll_md_exp, inode, file);
+
+ if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
+ libcfs_debug_dumplog();
+
+ return rc;
+}
+
+static int ll_intent_file_open(struct dentry *dentry, void *lmm,
+ int lmmsize, struct lookup_intent *itp)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct dentry *parent = dentry->d_parent;
+ const char *name = dentry->d_name.name;
+ const int len = dentry->d_name.len;
+ struct md_op_data *op_data;
+ struct ptlrpc_request *req;
+ __u32 opc = LUSTRE_OPC_ANY;
+ int rc;
+
+	/* Usually we come here only for NFSD, and we want the open lock.
+	 * But we can also get here with pre-2.6.15 patchless kernels, and in
+	 * that case that lock is also OK */
+	/* We can also get here if there was a cached open handle in
+	 * revalidate_it but it disappeared while we were getting from there
+	 * to ll_file_open. That means the file was closed and immediately
+	 * opened again, which makes it a good candidate for using the OPEN
+	 * lock */
+	/* If lmmsize & lmm are not 0, we are just setting stripe info
+	 * parameters. No need for the open lock */
+ if (lmm == NULL && lmmsize == 0) {
+ itp->it_flags |= MDS_OPEN_LOCK;
+ if (itp->it_flags & FMODE_WRITE)
+ opc = LUSTRE_OPC_CREATE;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, d_inode(parent),
+ inode, name, len,
+ O_RDWR, opc, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ itp->it_flags |= MDS_OPEN_BY_FID;
+ rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
+ 0 /*unused */, &req, ll_md_blocking_ast, 0);
+ ll_finish_md_op_data(op_data);
+ if (rc == -ESTALE) {
+		/* reason to keep our own exit path: don't flood the log
+		 * with -ESTALE error messages.
+		 */
+ if (!it_disposition(itp, DISP_OPEN_OPEN) ||
+ it_open_error(DISP_OPEN_OPEN, itp))
+ goto out;
+ ll_release_openhandle(inode, itp);
+ goto out;
+ }
+
+ if (it_disposition(itp, DISP_LOOKUP_NEG)) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
+ rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
+ CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
+ goto out;
+ }
+
+ rc = ll_prep_inode(&inode, req, NULL, itp);
+ if (!rc && itp->d.lustre.it_lock_mode)
+ ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
+
+out:
+ ptlrpc_req_finished(req);
+ ll_intent_drop_lock(itp);
+
+ return rc;
+}
+
+/**
+ * Assign an obtained @ioepoch to the client's inode. No lock is needed: the
+ * MDS does not trust the attributes while several ioepoch holders exist.
+ * Attributes for a previous ioepoch are also skipped by the MDS when a new
+ * one has been opened.
+ */
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
+{
+ if (ioepoch && lli->lli_ioepoch != ioepoch) {
+ lli->lli_ioepoch = ioepoch;
+ CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
+ ioepoch, PFID(&lli->lli_fid));
+ }
+}
+
+static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
+ struct obd_client_handle *och)
+{
+ struct ptlrpc_request *req = it->d.lustre.it_data;
+ struct mdt_body *body;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ och->och_fh = body->handle;
+ och->och_fid = body->fid1;
+ och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
+ och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+ och->och_flags = it->it_flags;
+
+ return md_set_open_replay_data(md_exp, och, it);
+}
+
+static int ll_local_open(struct file *file, struct lookup_intent *it,
+ struct ll_file_data *fd, struct obd_client_handle *och)
+{
+ struct inode *inode = file_inode(file);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ LASSERT(!LUSTRE_FPRIVATE(file));
+
+ LASSERT(fd != NULL);
+
+ if (och) {
+ struct ptlrpc_request *req = it->d.lustre.it_data;
+ struct mdt_body *body;
+ int rc;
+
+ rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
+ if (rc != 0)
+ return rc;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ ll_ioepoch_open(lli, body->ioepoch);
+ }
+
+ LUSTRE_FPRIVATE(file) = fd;
+ ll_readahead_init(inode, &fd->fd_ras);
+ fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+ return 0;
+}
+
+/* Open a file, and (for the very first open) create objects on the OSTs at
+ * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
+ * creation or open until ll_lov_setstripe() ioctl is called.
+ *
+ * If we already have the stripe MD locally then we don't request it in
+ * md_open(), by passing a lmm_size = 0.
+ *
+ * It is up to the application to ensure no other processes open this file
+ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
+ * used. We might be able to avoid races of that sort by getting lli_open_sem
+ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
+ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
+ */
+int ll_file_open(struct inode *inode, struct file *file)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+ .it_flags = file->f_flags };
+ struct obd_client_handle **och_p = NULL;
+ __u64 *och_usecount = NULL;
+ struct ll_file_data *fd;
+ int rc = 0, opendir_set = 0;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
+ inode->i_generation, inode, file->f_flags);
+
+ it = file->private_data; /* XXX: compat macro */
+ file->private_data = NULL; /* prevent ll_local_open assertion */
+
+ fd = ll_file_data_get();
+ if (fd == NULL) {
+ rc = -ENOMEM;
+ goto out_openerr;
+ }
+
+ fd->fd_file = file;
+ if (S_ISDIR(inode->i_mode)) {
+ spin_lock(&lli->lli_sa_lock);
+ if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
+ lli->lli_opendir_pid == 0) {
+ lli->lli_opendir_key = fd;
+ lli->lli_opendir_pid = current_pid();
+ opendir_set = 1;
+ }
+ spin_unlock(&lli->lli_sa_lock);
+ }
+
+ if (is_root_inode(inode)) {
+ LUSTRE_FPRIVATE(file) = fd;
+ return 0;
+ }
+
+ if (!it || !it->d.lustre.it_disposition) {
+ /* Convert f_flags into access mode. We cannot use file->f_mode,
+ * because everything but O_ACCMODE mask was stripped from
+ * there */
+ if ((oit.it_flags + 1) & O_ACCMODE)
+ oit.it_flags++;
+ if (file->f_flags & O_TRUNC)
+ oit.it_flags |= FMODE_WRITE;
+
+		/* The kernel only calls f_op->open() from dentry_open().
+		 * filp_open() calls dentry_open() after open_namei() has
+		 * checked permissions. Only nfsd_open() calls dentry_open()
+		 * directly without checking permissions, which is why the
+		 * code below is safe. */
+ if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
+ oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+ /* We do not want O_EXCL here, presumably we opened the file
+ * already? XXX - NFS implications? */
+ oit.it_flags &= ~O_EXCL;
+
+		/* bug 20584: if "it_flags" contains O_CREAT, the file will be
+		 * created if necessary, so "IT_CREAT" should be set to stay
+		 * consistent with it */
+ if (oit.it_flags & O_CREAT)
+ oit.it_op |= IT_CREAT;
+
+ it = &oit;
+ }
+
+restart:
+ /* Let's see if we have file open on MDS already. */
+ if (it->it_flags & FMODE_WRITE) {
+ och_p = &lli->lli_mds_write_och;
+ och_usecount = &lli->lli_open_fd_write_count;
+ } else if (it->it_flags & FMODE_EXEC) {
+ och_p = &lli->lli_mds_exec_och;
+ och_usecount = &lli->lli_open_fd_exec_count;
+ } else {
+ och_p = &lli->lli_mds_read_och;
+ och_usecount = &lli->lli_open_fd_read_count;
+ }
+
+ mutex_lock(&lli->lli_och_mutex);
+ if (*och_p) { /* Open handle is present */
+ if (it_disposition(it, DISP_OPEN_OPEN)) {
+			/* Well, there's an extra open request that we do not
+			 * need; let's close it somehow. This will decref the
+			 * request. */
+ rc = it_open_error(DISP_OPEN_OPEN, it);
+ if (rc) {
+ mutex_unlock(&lli->lli_och_mutex);
+ goto out_openerr;
+ }
+
+ ll_release_openhandle(inode, it);
+ }
+ (*och_usecount)++;
+
+ rc = ll_local_open(file, it, fd, NULL);
+ if (rc) {
+ (*och_usecount)--;
+ mutex_unlock(&lli->lli_och_mutex);
+ goto out_openerr;
+ }
+ } else {
+ LASSERT(*och_usecount == 0);
+ if (!it->d.lustre.it_disposition) {
+			/* We cannot just request the lock handle now; the new
+			 * ELC code means that one of the other OPEN locks for
+			 * this file could be cancelled, and since the blocking
+			 * AST handler would attempt to grab och_mutex as well,
+			 * that would result in a deadlock */
+ mutex_unlock(&lli->lli_och_mutex);
+ it->it_create_mode |= M_CHECK_STALE;
+ rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
+ it->it_create_mode &= ~M_CHECK_STALE;
+ if (rc)
+ goto out_openerr;
+
+ goto restart;
+ }
+ *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
+ if (!*och_p) {
+ rc = -ENOMEM;
+ goto out_och_free;
+ }
+
+ (*och_usecount)++;
+
+ /* md_intent_lock() didn't get a request ref if there was an
+ * open error, so don't do cleanup on the request here
+ * (bug 3430) */
+		/* XXX (green): Shouldn't we bail out on any error here, not
+		 * just an open error? */
+ rc = it_open_error(DISP_OPEN_OPEN, it);
+ if (rc)
+ goto out_och_free;
+
+ LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
+
+ rc = ll_local_open(file, it, fd, *och_p);
+ if (rc)
+ goto out_och_free;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+ fd = NULL;
+
+	/* Must do this outside the lli_och_mutex lock to prevent a deadlock
+	 * where a different kind of OPEN lock for this same inode gets
+	 * cancelled by ldlm_cancel_lru */
+ if (!S_ISREG(inode->i_mode))
+ goto out_och_free;
+
+ ll_capa_open(inode);
+
+ if (!lli->lli_has_smd &&
+ (cl_is_lov_delay_create(file->f_flags) ||
+ (file->f_mode & FMODE_WRITE) == 0)) {
+ CDEBUG(D_INODE, "object creation was delayed\n");
+ goto out_och_free;
+ }
+ cl_lov_delay_create_clear(&file->f_flags);
+ goto out_och_free;
+
+out_och_free:
+ if (rc) {
+ if (och_p && *och_p) {
+ OBD_FREE(*och_p, sizeof(struct obd_client_handle));
+ *och_p = NULL; /* OBD_FREE writes some magic there */
+ (*och_usecount)--;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+
+out_openerr:
+ if (opendir_set != 0)
+ ll_stop_statahead(inode, lli->lli_opendir_key);
+ if (fd != NULL)
+ ll_file_data_put(fd);
+ } else {
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
+ }
+
+ if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
+ ptlrpc_req_finished(it->d.lustre.it_data);
+ it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+ }
+
+ return rc;
+}
+
+static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *desc, void *data, int flag)
+{
+ int rc;
+ struct lustre_handle lockh;
+
+ switch (flag) {
+ case LDLM_CB_BLOCKING:
+ ldlm_lock2handle(lock, &lockh);
+ rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+ if (rc < 0) {
+ CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+ return rc;
+ }
+ break;
+ case LDLM_CB_CANCELING:
+ /* do nothing */
+ break;
+ }
+ return 0;
+}
+
+/**
+ * Acquire a lease and open the file.
+ */
+static struct obd_client_handle *
+ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
+ __u64 open_flags)
+{
+ struct lookup_intent it = { .it_op = IT_OPEN };
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct md_op_data *op_data;
+ struct ptlrpc_request *req;
+ struct lustre_handle old_handle = { 0 };
+ struct obd_client_handle *och = NULL;
+ int rc;
+ int rc2;
+
+ if (fmode != FMODE_WRITE && fmode != FMODE_READ)
+ return ERR_PTR(-EINVAL);
+
+ if (file != NULL) {
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct obd_client_handle **och_p;
+ __u64 *och_usecount;
+
+ if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
+ return ERR_PTR(-EPERM);
+
+ /* Get the openhandle of the file */
+ rc = -EBUSY;
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och != NULL) {
+ mutex_unlock(&lli->lli_och_mutex);
+ return ERR_PTR(rc);
+ }
+
+ if (fd->fd_och == NULL) {
+ if (file->f_mode & FMODE_WRITE) {
+ LASSERT(lli->lli_mds_write_och != NULL);
+ och_p = &lli->lli_mds_write_och;
+ och_usecount = &lli->lli_open_fd_write_count;
+ } else {
+ LASSERT(lli->lli_mds_read_och != NULL);
+ och_p = &lli->lli_mds_read_och;
+ och_usecount = &lli->lli_open_fd_read_count;
+ }
+ if (*och_usecount == 1) {
+ fd->fd_och = *och_p;
+ *och_p = NULL;
+ *och_usecount = 0;
+ rc = 0;
+ }
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+ if (rc < 0) /* more than 1 opener */
+ return ERR_PTR(rc);
+
+ LASSERT(fd->fd_och != NULL);
+ old_handle = fd->fd_och->och_fh;
+ }
+
+ och = kzalloc(sizeof(*och), GFP_NOFS);
+ if (!och)
+ return ERR_PTR(-ENOMEM);
+
+ op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data)) {
+ rc = PTR_ERR(op_data);
+ goto out;
+ }
+
+ /* To tell the MDT this openhandle is from the same owner */
+ op_data->op_handle = old_handle;
+
+ it.it_flags = fmode | open_flags;
+ it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
+ rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
+ ll_md_blocking_lease_ast,
+	/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
+	 * otherwise it can be cancelled, which may mislead applications into
+	 * thinking that the lease is broken;
+	 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
+	 * open in ll_md_blocking_ast(). Otherwise, since
+	 * ll_md_blocking_lease_ast doesn't deal with the openhandle, a normal
+	 * openhandle would be leaked. */
+ LDLM_FL_NO_LRU | LDLM_FL_EXCL);
+ ll_finish_md_op_data(op_data);
+ ptlrpc_req_finished(req);
+ if (rc < 0)
+ goto out_release_it;
+
+ if (it_disposition(&it, DISP_LOOKUP_NEG)) {
+ rc = -ENOENT;
+ goto out_release_it;
+ }
+
+ rc = it_open_error(DISP_OPEN_OPEN, &it);
+ if (rc)
+ goto out_release_it;
+
+ LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
+ ll_och_fill(sbi->ll_md_exp, &it, och);
+
+ if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
+ rc = -EOPNOTSUPP;
+ goto out_close;
+ }
+
+	/* lease already granted; handle the lease lock */
+ ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
+ if (it.d.lustre.it_lock_mode == 0 ||
+ it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
+ /* open lock must return for lease */
+ CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
+ PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
+ it.d.lustre.it_lock_bits);
+ rc = -EPROTO;
+ goto out_close;
+ }
+
+ ll_intent_release(&it);
+ return och;
+
+out_close:
+ rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
+ if (rc2)
+ CERROR("Close openhandle returned %d\n", rc2);
+
+ /* cancel open lock */
+ if (it.d.lustre.it_lock_mode != 0) {
+ ldlm_lock_decref_and_cancel(&och->och_lease_handle,
+ it.d.lustre.it_lock_mode);
+ it.d.lustre.it_lock_mode = 0;
+ }
+out_release_it:
+ ll_intent_release(&it);
+out:
+ OBD_FREE_PTR(och);
+ return ERR_PTR(rc);
+}
+
+/**
+ * Release the lease and close the file.
+ * It will check whether the lease was ever broken.
+ */
+static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
+ bool *lease_broken)
+{
+ struct ldlm_lock *lock;
+ bool cancelled = true;
+ int rc;
+
+ lock = ldlm_handle2lock(&och->och_lease_handle);
+ if (lock != NULL) {
+ lock_res_and_lock(lock);
+ cancelled = ldlm_is_cancel(lock);
+ unlock_res_and_lock(lock);
+ ldlm_lock_put(lock);
+ }
+
+ CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
+ PFID(&ll_i2info(inode)->lli_fid), cancelled);
+
+ if (!cancelled)
+ ldlm_cli_cancel(&och->och_lease_handle, 0);
+ if (lease_broken != NULL)
+ *lease_broken = cancelled;
+
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
+ NULL);
+ return rc;
+}
+
+/* Fills the obdo with the attributes for the lsm */
+static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
+ struct obd_capa *capa, struct obdo *obdo,
+ __u64 ioepoch, int sync)
+{
+ struct ptlrpc_request_set *set;
+ struct obd_info oinfo = { { { 0 } } };
+ int rc;
+
+ LASSERT(lsm != NULL);
+
+ oinfo.oi_md = lsm;
+ oinfo.oi_oa = obdo;
+ oinfo.oi_oa->o_oi = lsm->lsm_oi;
+ oinfo.oi_oa->o_mode = S_IFREG;
+ oinfo.oi_oa->o_ioepoch = ioepoch;
+ oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
+ OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+ OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
+ OBD_MD_FLDATAVERSION;
+ oinfo.oi_capa = capa;
+ if (sync) {
+ oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
+ oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
+ }
+
+ set = ptlrpc_prep_set();
+ if (set == NULL) {
+ CERROR("can't allocate ptlrpc set\n");
+ rc = -ENOMEM;
+ } else {
+ rc = obd_getattr_async(exp, &oinfo, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+ }
+ if (rc == 0)
+ oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+ OBD_MD_FLATIME | OBD_MD_FLMTIME |
+ OBD_MD_FLCTIME | OBD_MD_FLSIZE |
+ OBD_MD_FLDATAVERSION);
+ return rc;
+}
+
+/**
+ * Performs the getattr on the inode and updates its fields.
+ * If @sync != 0, perform the getattr under the server-side lock.
+ */
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+ __u64 ioepoch, int sync)
+{
+ struct obd_capa *capa = ll_mdscapa_get(inode);
+ struct lov_stripe_md *lsm;
+ int rc;
+
+ lsm = ccc_inode_lsm_get(inode);
+ rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
+ capa, obdo, ioepoch, sync);
+ capa_put(capa);
+ if (rc == 0) {
+ struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
+
+ obdo_refresh_inode(inode, obdo, obdo->o_valid);
+ CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
+ POSTID(oi), i_size_read(inode),
+ (unsigned long long)inode->i_blocks,
+ 1UL << inode->i_blkbits);
+ }
+ ccc_inode_lsm_put(inode, lsm);
+ return rc;
+}
+
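+/* Merge the attributes cached in the cl_object (size, blocks, timestamps
+ * from the OSTs) with the MDS-provided values and update the VFS inode,
+ * under the inode size lock. */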
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct cl_object *obj = lli->lli_clob;
+ struct cl_attr *attr = ccc_env_thread_attr(env);
+ struct ost_lvb lvb;
+ int rc = 0;
+
+ ll_inode_size_lock(inode);
+	/* merge the timestamps most recently obtained from the MDS with
+	 * the timestamps obtained from the OSTs */
+ LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+ LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+ LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+
+ lvb.lvb_size = i_size_read(inode);
+ lvb.lvb_blocks = inode->i_blocks;
+ lvb.lvb_mtime = LTIME_S(inode->i_mtime);
+ lvb.lvb_atime = LTIME_S(inode->i_atime);
+ lvb.lvb_ctime = LTIME_S(inode->i_ctime);
+
+ cl_object_attr_lock(obj);
+ rc = cl_object_attr_get(env, obj, attr);
+ cl_object_attr_unlock(obj);
+
+ if (rc == 0) {
+ if (lvb.lvb_atime < attr->cat_atime)
+ lvb.lvb_atime = attr->cat_atime;
+ if (lvb.lvb_ctime < attr->cat_ctime)
+ lvb.lvb_ctime = attr->cat_ctime;
+ if (lvb.lvb_mtime < attr->cat_mtime)
+ lvb.lvb_mtime = attr->cat_mtime;
+
+ CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
+ PFID(&lli->lli_fid), attr->cat_size);
+ cl_isize_write_nolock(inode, attr->cat_size);
+
+ inode->i_blocks = attr->cat_blocks;
+
+ LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
+ LTIME_S(inode->i_atime) = lvb.lvb_atime;
+ LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
+ }
+ ll_inode_size_unlock(inode);
+
+ return rc;
+}
+
+int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
+ lstat_t *st)
+{
+ struct obdo obdo = { 0 };
+ int rc;
+
+ rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
+ if (rc == 0) {
+ st->st_size = obdo.o_size;
+ st->st_blocks = obdo.o_blocks;
+ st->st_mtime = obdo.o_mtime;
+ st->st_atime = obdo.o_atime;
+ st->st_ctime = obdo.o_ctime;
+ }
+ return rc;
+}
+
+static bool file_is_noatime(const struct file *file)
+{
+ const struct vfsmount *mnt = file->f_path.mnt;
+ const struct inode *inode = file_inode(file);
+
+ /* Adapted from file_accessed() and touch_atime().*/
+ if (file->f_flags & O_NOATIME)
+ return true;
+
+ if (inode->i_flags & S_NOATIME)
+ return true;
+
+ if (IS_NOATIME(inode))
+ return true;
+
+ if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
+ return true;
+
+ if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+ return true;
+
+ if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
+ return true;
+
+ return false;
+}
+
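+/* Initialize a cl_io from the file flags: non-blocking mode, append/sync
+ * behaviour for writes, the cl lock request mode, and noatime handling. */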
+void ll_io_init(struct cl_io *io, const struct file *file, int write)
+{
+ struct inode *inode = file_inode(file);
+
+ io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
+ if (write) {
+ io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
+ io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
+ file->f_flags & O_DIRECT ||
+ IS_SYNC(inode);
+ }
+ io->ci_obj = ll_i2info(inode)->lli_clob;
+ io->ci_lockreq = CILR_MAYBE;
+ if (ll_file_nolock(file)) {
+ io->ci_lockreq = CILR_NEVER;
+ io->ci_no_srvlock = 1;
+ } else if (file->f_flags & O_APPEND) {
+ io->ci_lockreq = CILR_MANDATORY;
+ }
+
+ io->ci_noatime = file_is_noatime(file);
+}
+
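+/* Common back end for read, write and splice: set up a cl_io for the range,
+ * take lli_write_mutex for non-group-locked writes or lli_trunc_sem for
+ * reads, run cl_io_loop(), restart if the IO asks for it without having
+ * transferred any data, and account the bytes in the per-sb stats. */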
+static ssize_t
+ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
+ struct file *file, enum cl_io_type iot,
+ loff_t *ppos, size_t count)
+{
+ struct ll_inode_info *lli = ll_i2info(file_inode(file));
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct cl_io *io;
+ ssize_t result;
+
+restart:
+ io = ccc_env_thread_io(env);
+ ll_io_init(io, file, iot == CIT_WRITE);
+
+ if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+ struct vvp_io *vio = vvp_env_io(env);
+ struct ccc_io *cio = ccc_env_io(env);
+ int write_mutex_locked = 0;
+
+ cio->cui_fd = LUSTRE_FPRIVATE(file);
+ vio->cui_io_subtype = args->via_io_subtype;
+
+ switch (vio->cui_io_subtype) {
+ case IO_NORMAL:
+ cio->cui_iter = args->u.normal.via_iter;
+ cio->cui_iocb = args->u.normal.via_iocb;
+ if ((iot == CIT_WRITE) &&
+ !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+ if (mutex_lock_interruptible(&lli->
+ lli_write_mutex)) {
+ result = -ERESTARTSYS;
+ goto out;
+ }
+ write_mutex_locked = 1;
+ } else if (iot == CIT_READ) {
+ down_read(&lli->lli_trunc_sem);
+ }
+ break;
+ case IO_SPLICE:
+ vio->u.splice.cui_pipe = args->u.splice.via_pipe;
+ vio->u.splice.cui_flags = args->u.splice.via_flags;
+ break;
+ default:
+ CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
+ LBUG();
+ }
+ result = cl_io_loop(env, io);
+ if (write_mutex_locked)
+ mutex_unlock(&lli->lli_write_mutex);
+ else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
+ up_read(&lli->lli_trunc_sem);
+ } else {
+ /* cl_io_rw_init() handled IO */
+ result = io->ci_result;
+ }
+
+ if (io->ci_nob > 0) {
+ result = io->ci_nob;
+ *ppos = io->u.ci_wr.wr.crw_pos;
+ }
+out:
+ cl_io_fini(env, io);
+	/* If anything has been read or written (result != 0), we just return
+	 * a short read/write instead of restarting the IO. */
+ if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
+ CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
+ iot == CIT_READ ? "read" : "write",
+ file, *ppos, count);
+ LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
+ goto restart;
+ }
+
+ if (iot == CIT_READ) {
+ if (result >= 0)
+ ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
+ LPROC_LL_READ_BYTES, result);
+ } else if (iot == CIT_WRITE) {
+ if (result >= 0) {
+ ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
+ LPROC_LL_WRITE_BYTES, result);
+ fd->fd_write_failed = false;
+ } else if (result != -ERESTARTSYS) {
+ fd->fd_write_failed = true;
+ }
+ }
+
+ return result;
+}
+
+static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct lu_env *env;
+ struct vvp_io_args *args;
+ ssize_t result;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ args = vvp_env_args(env, IO_NORMAL);
+ args->u.normal.via_iter = to;
+ args->u.normal.via_iocb = iocb;
+
+ result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
+ &iocb->ki_pos, iov_iter_count(to));
+ cl_env_put(env, &refcheck);
+ return result;
+}
+
+/*
+ * Write to a file (through the page cache).
+ */
+static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct lu_env *env;
+ struct vvp_io_args *args;
+ ssize_t result;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ args = vvp_env_args(env, IO_NORMAL);
+ args->u.normal.via_iter = from;
+ args->u.normal.via_iocb = iocb;
+
+ result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
+ &iocb->ki_pos, iov_iter_count(from));
+ cl_env_put(env, &refcheck);
+ return result;
+}
+
+/*
+ * Send file content (through pagecache) somewhere with helper
+ */
+static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t count,
+ unsigned int flags)
+{
+ struct lu_env *env;
+ struct vvp_io_args *args;
+ ssize_t result;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ args = vvp_env_args(env, IO_SPLICE);
+ args->u.splice.via_pipe = pipe;
+ args->u.splice.via_flags = flags;
+
+ result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
+ cl_env_put(env, &refcheck);
+ return result;
+}
+
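+/* Re-create OST objects for an existing layout: copy the in-memory lsm and
+ * call obd_create() with OBD_FL_RECREATE_OBJS under the inode size lock. */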
+static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
+{
+ struct obd_export *exp = ll_i2dtexp(inode);
+ struct obd_trans_info oti = { 0 };
+ struct obdo *oa = NULL;
+ int lsm_size;
+ int rc = 0;
+ struct lov_stripe_md *lsm = NULL, *lsm2;
+
+ OBDO_ALLOC(oa);
+ if (oa == NULL)
+ return -ENOMEM;
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (!lsm_has_objects(lsm)) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
+ (lsm->lsm_stripe_count));
+
+ OBD_ALLOC_LARGE(lsm2, lsm_size);
+ if (lsm2 == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ oa->o_oi = *oi;
+ oa->o_nlink = ost_idx;
+ oa->o_flags |= OBD_FL_RECREATE_OBJS;
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
+ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+ obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
+ memcpy(lsm2, lsm, lsm_size);
+ ll_inode_size_lock(inode);
+ rc = obd_create(NULL, exp, oa, &lsm2, &oti);
+ ll_inode_size_unlock(inode);
+
+ OBD_FREE_LARGE(lsm2, lsm_size);
+out:
+ ccc_inode_lsm_put(inode, lsm);
+ OBDO_FREE(oa);
+ return rc;
+}
+
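+/*
+ * LL_IOC_RECREATE_OBJ: recreate an OST object from an object id and OST
+ * index supplied by userspace; ll_lov_recreate_fid() below does the same
+ * starting from a FID. Both require CFS_CAP_SYS_ADMIN.
+ */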
+static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
+{
+ struct ll_recreate_obj ucreat;
+ struct ost_id oi;
+
+ if (!capable(CFS_CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
+ sizeof(ucreat)))
+ return -EFAULT;
+
+ ostid_set_seq_mdt0(&oi);
+ ostid_set_id(&oi, ucreat.lrc_id);
+ return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
+}
+
+static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
+{
+ struct lu_fid fid;
+ struct ost_id oi;
+ u32 ost_idx;
+
+ if (!capable(CFS_CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
+ return -EFAULT;
+
+ fid_to_ostid(&fid, &oi);
+ ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
+ return ll_lov_recreate(inode, &oi, ost_idx);
+}
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
+ int flags, struct lov_user_md *lum, int lum_size)
+{
+ struct lov_stripe_md *lsm = NULL;
+ struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
+ int rc = 0;
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (lsm != NULL) {
+ ccc_inode_lsm_put(inode, lsm);
+ CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
+ inode->i_ino);
+ rc = -EEXIST;
+ goto out;
+ }
+
+ ll_inode_size_lock(inode);
+ rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
+ if (rc)
+ goto out_unlock;
+ rc = oit.d.lustre.it_status;
+ if (rc < 0)
+ goto out_req_free;
+
+ ll_release_openhandle(inode, &oit);
+
+out_unlock:
+ ll_inode_size_unlock(inode);
+ ll_intent_release(&oit);
+ ccc_inode_lsm_put(inode, lsm);
+out:
+ return rc;
+out_req_free:
+ ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
+ goto out;
+}
+
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+ struct lov_mds_md **lmmp, int *lmm_size,
+ struct ptlrpc_request **request)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct mdt_body *body;
+ struct lov_mds_md *lmm = NULL;
+ struct ptlrpc_request *req = NULL;
+ struct md_op_data *op_data;
+ int rc, lmmsize;
+
+ rc = ll_get_default_mdsize(sbi, &lmmsize);
+ if (rc)
+ return rc;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
+ strlen(filename), lmmsize,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+ rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+ ll_finish_md_op_data(op_data);
+ if (rc < 0) {
+ CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
+ filename, rc);
+ goto out;
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body != NULL); /* checked by mdc_getattr_name */
+
+ lmmsize = body->eadatasize;
+
+ if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+ lmmsize == 0) {
+ rc = -ENODATA;
+ goto out;
+ }
+
+ lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
+ LASSERT(lmm != NULL);
+
+ if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
+ (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ /*
+ * This is coming from the MDS, so is probably in
+ * little endian. We convert it to host endian before
+ * passing it to userspace.
+ */
+ if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
+ int stripe_count;
+
+ stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+ if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
+ stripe_count = 0;
+
+ /* If this function is called for a directory, avoid swabbing
+ * non-existent lsm objects. */
+ if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+ lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+ if (S_ISREG(body->mode))
+ lustre_swab_lov_user_md_objects(
+ ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+ stripe_count);
+ } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+ lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+ if (S_ISREG(body->mode))
+ lustre_swab_lov_user_md_objects(
+ ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+ stripe_count);
+ }
+ }
+
+out:
+ *lmmp = lmm;
+ *lmm_size = lmmsize;
+ *request = req;
+ return rc;
+}
+
+static int ll_lov_setea(struct inode *inode, struct file *file,
+ unsigned long arg)
+{
+ int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
+ struct lov_user_md *lump;
+ int lum_size = sizeof(struct lov_user_md) +
+ sizeof(struct lov_user_ost_data);
+ int rc;
+
+ if (!capable(CFS_CAP_SYS_ADMIN))
+ return -EPERM;
+
+ OBD_ALLOC_LARGE(lump, lum_size);
+ if (lump == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
+ OBD_FREE_LARGE(lump, lum_size);
+ return -EFAULT;
+ }
+
+ rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
+ lum_size);
+ cl_lov_delay_create_clear(&file->f_flags);
+
+ OBD_FREE_LARGE(lump, lum_size);
+ return rc;
+}
+
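+/*
+ * LL_IOC_LOV_SETSTRIPE: read a v1 or v3 lov_user_md from userspace, create
+ * the layout via ll_lov_setstripe_ea_info(), then refresh the layout
+ * generation and copy the resulting stripe information back to the caller.
+ */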
+static int ll_lov_setstripe(struct inode *inode, struct file *file,
+ unsigned long arg)
+{
+ struct lov_user_md_v3 lumv3;
+ struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+ struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+ struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+ int lum_size, rc;
+ int flags = FMODE_WRITE;
+
+ /* first try with v1 which is smaller than v3 */
+ lum_size = sizeof(struct lov_user_md_v1);
+ if (copy_from_user(lumv1, lumv1p, lum_size))
+ return -EFAULT;
+
+ if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+ lum_size = sizeof(struct lov_user_md_v3);
+ if (copy_from_user(&lumv3, lumv3p, lum_size))
+ return -EFAULT;
+ }
+
+ rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
+ lum_size);
+ cl_lov_delay_create_clear(&file->f_flags);
+ if (rc == 0) {
+ struct lov_stripe_md *lsm;
+ __u32 gen;
+
+ if (put_user(0, &lumv1p->lmm_stripe_count))
+ return -EFAULT;
+
+ ll_layout_refresh(inode, &gen);
+ lsm = ccc_inode_lsm_get(inode);
+ rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
+ 0, lsm, (void *)arg);
+ ccc_inode_lsm_put(inode, lsm);
+ }
+ return rc;
+}
+
+static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
+{
+ struct lov_stripe_md *lsm;
+ int rc = -ENODATA;
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (lsm != NULL)
+ rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
+ lsm, (void *)arg);
+ ccc_inode_lsm_put(inode, lsm);
+ return rc;
+}
+
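+/*
+ * Take a group lock with group id @arg on behalf of this open file.
+ * fd_flags and fd_grouplock are updated under lli_lock, and a second check
+ * after cl_get_grouplock() handles a race with another thread doing the
+ * same thing.
+ */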
+static int
+ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct ccc_grouplock grouplock;
+ int rc;
+
+ if (arg == 0) {
+ CWARN("group id for group lock must not be 0\n");
+ return -EINVAL;
+ }
+
+ if (ll_file_nolock(file))
+ return -EOPNOTSUPP;
+
+ spin_lock(&lli->lli_lock);
+ if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+ CWARN("group lock already existed with gid %lu\n",
+ fd->fd_grouplock.cg_gid);
+ spin_unlock(&lli->lli_lock);
+ return -EINVAL;
+ }
+ LASSERT(fd->fd_grouplock.cg_lock == NULL);
+ spin_unlock(&lli->lli_lock);
+
+ rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+ arg, (file->f_flags & O_NONBLOCK), &grouplock);
+ if (rc)
+ return rc;
+
+ spin_lock(&lli->lli_lock);
+ if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+ spin_unlock(&lli->lli_lock);
+ CERROR("another thread just won the race\n");
+ cl_put_grouplock(&grouplock);
+ return -EINVAL;
+ }
+
+ fd->fd_flags |= LL_FILE_GROUP_LOCKED;
+ fd->fd_grouplock = grouplock;
+ spin_unlock(&lli->lli_lock);
+
+ CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
+ return 0;
+}
+
+static int ll_put_grouplock(struct inode *inode, struct file *file,
+ unsigned long arg)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct ccc_grouplock grouplock;
+
+ spin_lock(&lli->lli_lock);
+ if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+ spin_unlock(&lli->lli_lock);
+ CWARN("no group lock held\n");
+ return -EINVAL;
+ }
+ LASSERT(fd->fd_grouplock.cg_lock != NULL);
+
+ if (fd->fd_grouplock.cg_gid != arg) {
+ CWARN("group lock %lu doesn't match current id %lu\n",
+ arg, fd->fd_grouplock.cg_gid);
+ spin_unlock(&lli->lli_lock);
+ return -EINVAL;
+ }
+
+ grouplock = fd->fd_grouplock;
+ memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
+ fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
+ spin_unlock(&lli->lli_lock);
+
+ cl_put_grouplock(&grouplock);
+ CDEBUG(D_INFO, "group lock %lu released\n", arg);
+ return 0;
+}
+
+/**
+ * Close inode open handle
+ *
+ * \param inode [in] inode in question
+ * \param it [in,out] intent which contains open info and result
+ *
+ * \retval 0 success
+ * \retval <0 failure
+ */
+int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
+{
+ struct obd_client_handle *och;
+ int rc;
+
+ LASSERT(inode);
+
+ /* Root ? Do nothing. */
+ if (is_root_inode(inode))
+ return 0;
+
+ /* No open handle to close? Move away */
+ if (!it_disposition(it, DISP_OPEN_OPEN))
+ return 0;
+
+ LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
+
+ och = kzalloc(sizeof(*och), GFP_NOFS);
+ if (!och) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
+
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+ inode, och, NULL);
+out:
+ /* this one is in place of ll_file_open */
+ if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
+ ptlrpc_req_finished(it->d.lustre.it_data);
+ it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+ }
+ return rc;
+}
+
+/**
+ * Get the size of the inode for which a FIEMAP mapping is requested,
+ * make the FIEMAP get_info call and return the result.
+ */
+static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+ size_t num_bytes)
+{
+ struct obd_export *exp = ll_i2dtexp(inode);
+ struct lov_stripe_md *lsm = NULL;
+ struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
+ __u32 vallen = num_bytes;
+ int rc;
+
+ /* Checks for fiemap flags */
+ if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+ fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+ return -EBADR;
+ }
+
+ /* Check for FIEMAP_FLAG_SYNC */
+ if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
+ rc = filemap_fdatawrite(inode->i_mapping);
+ if (rc)
+ return rc;
+ }
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (lsm == NULL)
+ return -ENOENT;
+
+ /* If the stripe_count > 1 and the application does not understand
+ * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+ */
+ if (lsm->lsm_stripe_count > 1 &&
+ !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ fm_key.oa.o_oi = lsm->lsm_oi;
+ fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+ if (i_size_read(inode) == 0) {
+ rc = ll_glimpse_size(inode);
+ if (rc)
+ goto out;
+ }
+
+ obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
+ obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
+ /* If filesize is 0, then there would be no objects for mapping */
+ if (fm_key.oa.o_size == 0) {
+ fiemap->fm_mapped_extents = 0;
+ rc = 0;
+ goto out;
+ }
+
+ memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+
+ rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
+ fiemap, lsm);
+ if (rc)
+ CERROR("obd_get_info failed: rc = %d\n", rc);
+
+out:
+ ccc_inode_lsm_put(inode, lsm);
+ return rc;
+}
+
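+/*
+ * OBD_IOC_FID2PATH: resolve a FID to a path name. The user-supplied
+ * getinfo_fid2path buffer is copied in, passed to the MDC through
+ * obd_iocontrol() and copied back with the result.
+ */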
+int ll_fid2path(struct inode *inode, void __user *arg)
+{
+ struct obd_export *exp = ll_i2mdexp(inode);
+ const struct getinfo_fid2path __user *gfin = arg;
+ struct getinfo_fid2path *gfout;
+ u32 pathlen;
+ size_t outsize;
+ int rc;
+
+ if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
+ !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
+ return -EPERM;
+
+ /* Only need to get the buflen */
+ if (get_user(pathlen, &gfin->gf_pathlen))
+ return -EFAULT;
+
+ if (pathlen > PATH_MAX)
+ return -EINVAL;
+
+ outsize = sizeof(*gfout) + pathlen;
+
+ gfout = kzalloc(outsize, GFP_NOFS);
+ if (!gfout)
+ return -ENOMEM;
+
+ if (copy_from_user(gfout, arg, sizeof(*gfout))) {
+ rc = -EFAULT;
+ goto gf_free;
+ }
+
+ /* Call mdc_iocontrol */
+ rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
+ if (rc != 0)
+ goto gf_free;
+
+ if (copy_to_user(arg, gfout, outsize))
+ rc = -EFAULT;
+
+gf_free:
+ kfree(gfout);
+ return rc;
+}
+
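+/*
+ * FSFILT_IOC_FIEMAP: size a kernel fiemap buffer from the user-supplied
+ * fm_extent_count, copy in the request (and the first extent, which carries
+ * the continuation state of a previous call), run ll_do_fiemap() and copy
+ * the mapped extents back to userspace.
+ */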
+static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
+{
+ struct ll_user_fiemap *fiemap_s;
+ size_t num_bytes, ret_bytes;
+ unsigned int extent_count;
+ int rc = 0;
+
+ /* Get the extent count so we can calculate the size of the
+ * required fiemap buffer. */
+ if (get_user(extent_count,
+ &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+ return -EFAULT;
+
+ if (extent_count >=
+ (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
+ return -EINVAL;
+ num_bytes = sizeof(*fiemap_s) + (extent_count *
+ sizeof(struct ll_fiemap_extent));
+
+ OBD_ALLOC_LARGE(fiemap_s, num_bytes);
+ if (fiemap_s == NULL)
+ return -ENOMEM;
+
+ /* get the fiemap value */
+ if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
+ sizeof(*fiemap_s))) {
+ rc = -EFAULT;
+ goto error;
+ }
+
+ /* If fm_extent_count is non-zero, read the first extent since
+ * it is used to calculate end_offset and device from previous
+ * fiemap call. */
+ if (extent_count) {
+ if (copy_from_user(&fiemap_s->fm_extents[0],
+ (char __user *)arg + sizeof(*fiemap_s),
+ sizeof(struct ll_fiemap_extent))) {
+ rc = -EFAULT;
+ goto error;
+ }
+ }
+
+ rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
+ if (rc)
+ goto error;
+
+ ret_bytes = sizeof(struct ll_user_fiemap);
+
+ if (extent_count != 0)
+ ret_bytes += (fiemap_s->fm_mapped_extents *
+ sizeof(struct ll_fiemap_extent));
+
+ if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+ rc = -EFAULT;
+
+error:
+ OBD_FREE_LARGE(fiemap_s, num_bytes);
+ return rc;
+}
+
+/*
+ * Read the data_version for the inode.
+ *
+ * This value is computed using the stripe object version on the OST.
+ * The version is computed using server-side locking.
+ *
+ * @param extent_lock Take extent lock. Not needed if a process is already
+ * holding the OST object group locks.
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version,
+ int extent_lock)
+{
+ struct lov_stripe_md *lsm = NULL;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct obdo *obdo = NULL;
+ int rc;
+
+ /* If no stripe, we consider version is 0. */
+ lsm = ccc_inode_lsm_get(inode);
+ if (!lsm_has_objects(lsm)) {
+ *data_version = 0;
+ CDEBUG(D_INODE, "No object for inode\n");
+ rc = 0;
+ goto out;
+ }
+
+ obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
+ if (!obdo) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
+ if (rc == 0) {
+ if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
+ rc = -EOPNOTSUPP;
+ else
+ *data_version = obdo->o_data_version;
+ }
+
+ kfree(obdo);
+out:
+ ccc_inode_lsm_put(inode, lsm);
+ return rc;
+}
+
+/*
+ * Trigger a HSM release request for the provided inode.
+ */
+int ll_hsm_release(struct inode *inode)
+{
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct obd_client_handle *och = NULL;
+ __u64 data_version = 0;
+ int rc;
+
+ CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
+ ll_get_fsname(inode->i_sb, NULL, 0),
+ PFID(&ll_i2info(inode)->lli_fid));
+
+ och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
+ if (IS_ERR(och)) {
+ rc = PTR_ERR(och);
+ goto out;
+ }
+
+ /* Grab latest data_version and [am]time values */
+ rc = ll_data_version(inode, &data_version, 1);
+ if (rc != 0)
+ goto out;
+
+ env = cl_env_nested_get(&nest);
+ if (IS_ERR(env)) {
+ rc = PTR_ERR(env);
+ goto out;
+ }
+
+ ll_merge_lvb(env, inode);
+ cl_env_nested_put(&nest, env);
+
+ /* Release the file.
+ * NB: lease lock handle is released in mdc_hsm_release_pack() because
+ * we still need it to pack l_remote_handle to MDT. */
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
+ &data_version);
+ och = NULL;
+
+out:
+ if (och != NULL && !IS_ERR(och)) /* close the file */
+ ll_lease_close(och, inode, NULL);
+
+ return rc;
+}
+
+struct ll_swap_stack {
+ struct iattr ia1, ia2;
+ __u64 dv1, dv2;
+ struct inode *inode1, *inode2;
+ bool check_dv1, check_dv2;
+};
+
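+/*
+ * Swap the layouts of two open files: order the inodes by FID, optionally
+ * take group locks to flush dirty cache, verify the data versions if
+ * requested and ask the MDT to perform the swap; mtime/atime are restored
+ * afterwards when the caller asked to keep them.
+ */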
+static int ll_swap_layouts(struct file *file1, struct file *file2,
+ struct lustre_swap_layouts *lsl)
+{
+ struct mdc_swap_layouts msl;
+ struct md_op_data *op_data;
+ __u32 gid;
+ __u64 dv;
+ struct ll_swap_stack *llss = NULL;
+ int rc;
+
+ llss = kzalloc(sizeof(*llss), GFP_NOFS);
+ if (!llss)
+ return -ENOMEM;
+
+ llss->inode1 = file_inode(file1);
+ llss->inode2 = file_inode(file2);
+
+ if (!S_ISREG(llss->inode2->i_mode)) {
+ rc = -EINVAL;
+ goto free;
+ }
+
+ if (inode_permission(llss->inode1, MAY_WRITE) ||
+ inode_permission(llss->inode2, MAY_WRITE)) {
+ rc = -EPERM;
+ goto free;
+ }
+
+ if (llss->inode2->i_sb != llss->inode1->i_sb) {
+ rc = -EXDEV;
+ goto free;
+ }
+
+ /* we use two bools because they are easier to swap than two bits */
+ if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
+ llss->check_dv1 = true;
+
+ if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
+ llss->check_dv2 = true;
+
+ /* we cannot use lsl->sl_dvX directly because we may swap them */
+ llss->dv1 = lsl->sl_dv1;
+ llss->dv2 = lsl->sl_dv2;
+
+ rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
+ if (rc == 0) /* same file, done! */ {
+ rc = 0;
+ goto free;
+ }
+
+ if (rc < 0) { /* sequentialize it */
+ swap(llss->inode1, llss->inode2);
+ swap(file1, file2);
+ swap(llss->dv1, llss->dv2);
+ swap(llss->check_dv1, llss->check_dv2);
+ }
+
+ gid = lsl->sl_gid;
+ if (gid != 0) { /* application asks to flush dirty cache */
+ rc = ll_get_grouplock(llss->inode1, file1, gid);
+ if (rc < 0)
+ goto free;
+
+ rc = ll_get_grouplock(llss->inode2, file2, gid);
+ if (rc < 0) {
+ ll_put_grouplock(llss->inode1, file1, gid);
+ goto free;
+ }
+ }
+
+ /* To be able to restore mtime and atime after the swap,
+ * we need to save them first. */
+ if (lsl->sl_flags &
+ (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
+ llss->ia1.ia_mtime = llss->inode1->i_mtime;
+ llss->ia1.ia_atime = llss->inode1->i_atime;
+ llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
+ llss->ia2.ia_mtime = llss->inode2->i_mtime;
+ llss->ia2.ia_atime = llss->inode2->i_atime;
+ llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
+ }
+
+ /* Final check: before swapping the layouts, verify that the
+ * data version has not changed (if requested). */
+ if (llss->check_dv1) {
+ rc = ll_data_version(llss->inode1, &dv, 0);
+ if (rc)
+ goto putgl;
+ if (dv != llss->dv1) {
+ rc = -EAGAIN;
+ goto putgl;
+ }
+ }
+
+ if (llss->check_dv2) {
+ rc = ll_data_version(llss->inode2, &dv, 0);
+ if (rc)
+ goto putgl;
+ if (dv != llss->dv2) {
+ rc = -EAGAIN;
+ goto putgl;
+ }
+ }
+
+ /* struct md_op_data is used to send the swap args to the MDT;
+ * only the flags are missing, so we pass struct mdc_swap_layouts
+ * through md_op_data->op_data. */
+ /* Flags from user space have to be converted before they are sent to
+ * the server. No flag is sent today; they are only used on the client. */
+ msl.msl_flags = 0;
+ rc = -ENOMEM;
+ op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
+ 0, LUSTRE_OPC_ANY, &msl);
+ if (IS_ERR(op_data)) {
+ rc = PTR_ERR(op_data);
+ goto free;
+ }
+
+ rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
+ sizeof(*op_data), op_data, NULL);
+ ll_finish_md_op_data(op_data);
+
+putgl:
+ if (gid != 0) {
+ ll_put_grouplock(llss->inode2, file2, gid);
+ ll_put_grouplock(llss->inode1, file1, gid);
+ }
+
+ /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
+ if (rc != 0)
+ goto free;
+
+ /* clear useless flags */
+ if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
+ llss->ia1.ia_valid &= ~ATTR_MTIME;
+ llss->ia2.ia_valid &= ~ATTR_MTIME;
+ }
+
+ if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
+ llss->ia1.ia_valid &= ~ATTR_ATIME;
+ llss->ia2.ia_valid &= ~ATTR_ATIME;
+ }
+
+ /* update time if requested */
+ rc = 0;
+ if (llss->ia2.ia_valid != 0) {
+ mutex_lock(&llss->inode1->i_mutex);
+ rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
+ mutex_unlock(&llss->inode1->i_mutex);
+ }
+
+ if (llss->ia1.ia_valid != 0) {
+ int rc1;
+
+ mutex_lock(&llss->inode2->i_mutex);
+ rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
+ mutex_unlock(&llss->inode2->i_mutex);
+ if (rc == 0)
+ rc = rc1;
+ }
+
+free:
+ kfree(llss);
+
+ return rc;
+}
+
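+/*
+ * Set or clear HSM flags on a file by packing the hsm_state_set into
+ * md_op_data and sending LL_IOC_HSM_STATE_SET to the MDT.
+ */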
+static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
+{
+ struct md_op_data *op_data;
+ int rc;
+
+ /* Non-root users are forbidden to set or clear flags which are
+ * NOT defined in HSM_USER_MASK. */
+ if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
+ !capable(CFS_CAP_SYS_ADMIN))
+ return -EPERM;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, hss);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
+ sizeof(*op_data), op_data, NULL);
+
+ ll_finish_md_op_data(op_data);
+
+ return rc;
+}
+
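+/*
+ * HSM import: mark the file as archived, existing and released via
+ * ll_hsm_state_set(), then restore the saved mode, owner, size and
+ * timestamps with ll_setattr_raw().
+ */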
+static int ll_hsm_import(struct inode *inode, struct file *file,
+ struct hsm_user_import *hui)
+{
+ struct hsm_state_set *hss = NULL;
+ struct iattr *attr = NULL;
+ int rc;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ /* set HSM flags */
+ hss = kzalloc(sizeof(*hss), GFP_NOFS);
+ if (!hss) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
+ hss->hss_archive_id = hui->hui_archive_id;
+ hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
+ rc = ll_hsm_state_set(inode, hss);
+ if (rc != 0)
+ goto out;
+
+ attr = kzalloc(sizeof(*attr), GFP_NOFS);
+ if (!attr) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
+ attr->ia_mode |= S_IFREG;
+ attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
+ attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
+ attr->ia_size = hui->hui_size;
+ attr->ia_mtime.tv_sec = hui->hui_mtime;
+ attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
+ attr->ia_atime.tv_sec = hui->hui_atime;
+ attr->ia_atime.tv_nsec = hui->hui_atime_ns;
+
+ attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
+ ATTR_UID | ATTR_GID |
+ ATTR_MTIME | ATTR_MTIME_SET |
+ ATTR_ATIME | ATTR_ATIME_SET;
+
+ mutex_lock(&inode->i_mutex);
+
+ rc = ll_setattr_raw(file->f_path.dentry, attr, true);
+ if (rc == -ENODATA)
+ rc = 0;
+
+ mutex_unlock(&inode->i_mutex);
+
+out:
+ kfree(hss);
+ kfree(attr);
+
+ return rc;
+}
+
+static long
+ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ int flags, rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
+ inode->i_generation, inode, cmd);
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+
+ /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
+ if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+ return -ENOTTY;
+
+ switch (cmd) {
+ case LL_IOC_GETFLAGS:
+ /* Get the current value of the file flags */
+ return put_user(fd->fd_flags, (int *)arg);
+ case LL_IOC_SETFLAGS:
+ case LL_IOC_CLRFLAGS:
+ /* Set or clear specific file flags */
+ /* XXX This probably needs checks to ensure the flags are
+ * not abused, and to handle any flag side effects.
+ */
+ if (get_user(flags, (int *) arg))
+ return -EFAULT;
+
+ if (cmd == LL_IOC_SETFLAGS) {
+ if ((flags & LL_FILE_IGNORE_LOCK) &&
+ !(file->f_flags & O_DIRECT)) {
+ CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
+ current->comm);
+ return -EINVAL;
+ }
+
+ fd->fd_flags |= flags;
+ } else {
+ fd->fd_flags &= ~flags;
+ }
+ return 0;
+ case LL_IOC_LOV_SETSTRIPE:
+ return ll_lov_setstripe(inode, file, arg);
+ case LL_IOC_LOV_SETEA:
+ return ll_lov_setea(inode, file, arg);
+ case LL_IOC_LOV_SWAP_LAYOUTS: {
+ struct file *file2;
+ struct lustre_swap_layouts lsl;
+
+ if (copy_from_user(&lsl, (char *)arg,
+ sizeof(struct lustre_swap_layouts)))
+ return -EFAULT;
+
+ if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+ return -EPERM;
+
+ file2 = fget(lsl.sl_fd);
+ if (file2 == NULL)
+ return -EBADF;
+
+ rc = -EPERM;
+ if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+ rc = ll_swap_layouts(file, file2, &lsl);
+ fput(file2);
+ return rc;
+ }
+ case LL_IOC_LOV_GETSTRIPE:
+ return ll_lov_getstripe(inode, arg);
+ case LL_IOC_RECREATE_OBJ:
+ return ll_lov_recreate_obj(inode, arg);
+ case LL_IOC_RECREATE_FID:
+ return ll_lov_recreate_fid(inode, arg);
+ case FSFILT_IOC_FIEMAP:
+ return ll_ioctl_fiemap(inode, arg);
+ case FSFILT_IOC_GETFLAGS:
+ case FSFILT_IOC_SETFLAGS:
+ return ll_iocontrol(inode, file, cmd, arg);
+ case FSFILT_IOC_GETVERSION_OLD:
+ case FSFILT_IOC_GETVERSION:
+ return put_user(inode->i_generation, (int *)arg);
+ case LL_IOC_GROUP_LOCK:
+ return ll_get_grouplock(inode, file, arg);
+ case LL_IOC_GROUP_UNLOCK:
+ return ll_put_grouplock(inode, file, arg);
+ case IOC_OBD_STATFS:
+ return ll_obd_statfs(inode, (void *)arg);
+
+ /* We need to special case any other ioctls we want to handle,
+ * to send them to the MDS/OST as appropriate and to properly
+ * network encode the arg field.
+ case FSFILT_IOC_SETVERSION_OLD:
+ case FSFILT_IOC_SETVERSION:
+ */
+ case LL_IOC_FLUSHCTX:
+ return ll_flush_ctx(inode);
+ case LL_IOC_PATH2FID: {
+ if (copy_to_user((void *)arg, ll_inode2fid(inode),
+ sizeof(struct lu_fid)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case OBD_IOC_FID2PATH:
+ return ll_fid2path(inode, (void *)arg);
+ case LL_IOC_DATA_VERSION: {
+ struct ioc_data_version idv;
+ int rc;
+
+ if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
+ return -EFAULT;
+
+ rc = ll_data_version(inode, &idv.idv_version,
+ !(idv.idv_flags & LL_DV_NOFLUSH));
+
+ if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
+ return -EFAULT;
+
+ return rc;
+ }
+
+ case LL_IOC_GET_MDTIDX: {
+ int mdtidx;
+
+ mdtidx = ll_get_mdt_idx(inode);
+ if (mdtidx < 0)
+ return mdtidx;
+
+ if (put_user((int)mdtidx, (int *)arg))
+ return -EFAULT;
+
+ return 0;
+ }
+ case OBD_IOC_GETDTNAME:
+ case OBD_IOC_GETMDNAME:
+ return ll_get_obd_name(inode, cmd, arg);
+ case LL_IOC_HSM_STATE_GET: {
+ struct md_op_data *op_data;
+ struct hsm_user_state *hus;
+ int rc;
+
+ hus = kzalloc(sizeof(*hus), GFP_NOFS);
+ if (!hus)
+ return -ENOMEM;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, hus);
+ if (IS_ERR(op_data)) {
+ kfree(hus);
+ return PTR_ERR(op_data);
+ }
+
+ rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+ op_data, NULL);
+
+ if (copy_to_user((void *)arg, hus, sizeof(*hus)))
+ rc = -EFAULT;
+
+ ll_finish_md_op_data(op_data);
+ kfree(hus);
+ return rc;
+ }
+ case LL_IOC_HSM_STATE_SET: {
+ struct hsm_state_set *hss;
+ int rc;
+
+ hss = kzalloc(sizeof(*hss), GFP_NOFS);
+ if (!hss)
+ return -ENOMEM;
+
+ if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
+ kfree(hss);
+ return -EFAULT;
+ }
+
+ rc = ll_hsm_state_set(inode, hss);
+
+ kfree(hss);
+ return rc;
+ }
+ case LL_IOC_HSM_ACTION: {
+ struct md_op_data *op_data;
+ struct hsm_current_action *hca;
+ int rc;
+
+ hca = kzalloc(sizeof(*hca), GFP_NOFS);
+ if (!hca)
+ return -ENOMEM;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, hca);
+ if (IS_ERR(op_data)) {
+ kfree(hca);
+ return PTR_ERR(op_data);
+ }
+
+ rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+ op_data, NULL);
+
+ if (copy_to_user((char *)arg, hca, sizeof(*hca)))
+ rc = -EFAULT;
+
+ ll_finish_md_op_data(op_data);
+ kfree(hca);
+ return rc;
+ }
+ case LL_IOC_SET_LEASE: {
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_client_handle *och = NULL;
+ bool lease_broken;
+ fmode_t mode = 0;
+
+ switch (arg) {
+ case F_WRLCK:
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ mode = FMODE_WRITE;
+ break;
+ case F_RDLCK:
+ if (!(file->f_mode & FMODE_READ))
+ return -EPERM;
+ mode = FMODE_READ;
+ break;
+ case F_UNLCK:
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och != NULL) {
+ och = fd->fd_lease_och;
+ fd->fd_lease_och = NULL;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+
+ if (och != NULL) {
+ mode = och->och_flags &
+ (FMODE_READ|FMODE_WRITE);
+ rc = ll_lease_close(och, inode, &lease_broken);
+ if (rc == 0 && lease_broken)
+ mode = 0;
+ } else {
+ rc = -ENOLCK;
+ }
+
+ /* return the type of lease or error */
+ return rc < 0 ? rc : (int)mode;
+ default:
+ return -EINVAL;
+ }
+
+ CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
+
+ /* apply for lease */
+ och = ll_lease_open(inode, file, mode, 0);
+ if (IS_ERR(och))
+ return PTR_ERR(och);
+
+ rc = 0;
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och == NULL) {
+ fd->fd_lease_och = och;
+ och = NULL;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+ if (och != NULL) {
+ /* This should not happen while only exclusive leases are supported. */
+ ll_lease_close(och, inode, &lease_broken);
+ rc = -EBUSY;
+ }
+ return rc;
+ }
+ case LL_IOC_GET_LEASE: {
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ldlm_lock *lock = NULL;
+
+ rc = 0;
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och != NULL) {
+ struct obd_client_handle *och = fd->fd_lease_och;
+
+ lock = ldlm_handle2lock(&och->och_lease_handle);
+ if (lock != NULL) {
+ lock_res_and_lock(lock);
+ if (!ldlm_is_cancel(lock))
+ rc = och->och_flags &
+ (FMODE_READ | FMODE_WRITE);
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+ }
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+ return rc;
+ }
+ case LL_IOC_HSM_IMPORT: {
+ struct hsm_user_import *hui;
+
+ hui = kzalloc(sizeof(*hui), GFP_NOFS);
+ if (!hui)
+ return -ENOMEM;
+
+ if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
+ kfree(hui);
+ return -EFAULT;
+ }
+
+ rc = ll_hsm_import(inode, file, hui);
+
+ kfree(hui);
+ return rc;
+ }
+ default: {
+ int err;
+
+ if (LLIOC_STOP ==
+ ll_iocontrol_call(inode, file, cmd, arg, &err))
+ return err;
+
+ return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
+ (void *)arg);
+ }
+ }
+}
+
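+/*
+ * llseek: SEEK_END, SEEK_HOLE and SEEK_DATA need an up-to-date file size,
+ * so refresh it with ll_glimpse_size() before handing off to
+ * generic_file_llseek_size().
+ */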
+static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file_inode(file);
+ loff_t retval, eof = 0;
+
+ retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
+ (origin == SEEK_CUR) ? file->f_pos : 0);
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
+ inode->i_ino, inode->i_generation, inode, retval, retval,
+ origin);
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
+
+ if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
+ retval = ll_glimpse_size(inode);
+ if (retval != 0)
+ return retval;
+ eof = i_size_read(inode);
+ }
+
+ retval = generic_file_llseek_size(file, offset, origin,
+ ll_file_maxbytes(inode), eof);
+ return retval;
+}
+
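+/*
+ * flush() on close: report any asynchronous write error recorded for this
+ * mapping, unless the application has already been told about a write
+ * failure on this file descriptor.
+ */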
+static int ll_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ int rc, err;
+
+ LASSERT(!S_ISDIR(inode->i_mode));
+
+ /* catch async errors that were recorded back when async writeback
+ * failed for pages in this mapping. */
+ rc = lli->lli_async_rc;
+ lli->lli_async_rc = 0;
+ err = lov_read_and_clear_async_rc(lli->lli_clob);
+ if (rc == 0)
+ rc = err;
+
+ /* The application has already been told about the write failure.
+ * Do not report it again. */
+ if (fd->fd_write_failed)
+ return 0;
+ return rc ? -EIO : 0;
+}
+
+/**
+ * Called to make sure a portion of the file has been written out.
+ * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
+ *
+ * Return how many pages have been written.
+ */
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+ enum cl_fsync_mode mode, int ignore_layout)
+{
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct obd_capa *capa = NULL;
+ struct cl_fsync_io *fio;
+ int result;
+
+ if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
+ mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
+ return -EINVAL;
+
+ env = cl_env_nested_get(&nest);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
+
+ io = ccc_env_thread_io(env);
+ io->ci_obj = cl_i2info(inode)->lli_clob;
+ io->ci_ignore_layout = ignore_layout;
+
+ /* initialize parameters for sync */
+ fio = &io->u.ci_fsync;
+ fio->fi_capa = capa;
+ fio->fi_start = start;
+ fio->fi_end = end;
+ fio->fi_fid = ll_inode2fid(inode);
+ fio->fi_mode = mode;
+ fio->fi_nr_written = 0;
+
+ if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
+ result = cl_io_loop(env, io);
+ else
+ result = io->ci_result;
+ if (result == 0)
+ result = fio->fi_nr_written;
+ cl_io_fini(env, io);
+ cl_env_nested_put(&nest, env);
+
+ capa_put(capa);
+
+ return result;
+}
+
+int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file_inode(file);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ptlrpc_request *req;
+ struct obd_capa *oc;
+ int rc, err;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+ inode->i_generation, inode);
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
+
+ rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ mutex_lock(&inode->i_mutex);
+
+ /* catch async errors that were recorded back when async writeback
+ * failed for pages in this mapping. */
+ if (!S_ISDIR(inode->i_mode)) {
+ err = lli->lli_async_rc;
+ lli->lli_async_rc = 0;
+ if (rc == 0)
+ rc = err;
+ err = lov_read_and_clear_async_rc(lli->lli_clob);
+ if (rc == 0)
+ rc = err;
+ }
+
+ oc = ll_mdscapa_get(inode);
+ err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
+ &req);
+ capa_put(oc);
+ if (!rc)
+ rc = err;
+ if (!err)
+ ptlrpc_req_finished(req);
+
+ if (S_ISREG(inode->i_mode)) {
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+ err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
+ if (rc == 0 && err < 0)
+ rc = err;
+ if (rc < 0)
+ fd->fd_write_failed = true;
+ else
+ fd->fd_write_failed = false;
+ }
+
+ mutex_unlock(&inode->i_mutex);
+ return rc;
+}
+
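+/*
+ * POSIX/BSD lock handling: translate the VFS file_lock into an LDLM flock
+ * enqueue on the MDT, then mirror the result in the local lock lists; if
+ * the local step fails, the server lock is released again with an LCK_NL
+ * enqueue.
+ */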
+static int
+ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+ struct inode *inode = file_inode(file);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ldlm_enqueue_info einfo = {
+ .ei_type = LDLM_FLOCK,
+ .ei_cb_cp = ldlm_flock_completion_ast,
+ .ei_cbdata = file_lock,
+ };
+ struct md_op_data *op_data;
+ struct lustre_handle lockh = {0};
+ ldlm_policy_data_t flock = {{0}};
+ __u64 flags = 0;
+ int rc;
+ int rc2 = 0;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
+ inode->i_ino, file_lock);
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
+
+ if (file_lock->fl_flags & FL_FLOCK)
+ LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
+ else if (!(file_lock->fl_flags & FL_POSIX))
+ return -EINVAL;
+
+ flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
+ flock.l_flock.pid = file_lock->fl_pid;
+ flock.l_flock.start = file_lock->fl_start;
+ flock.l_flock.end = file_lock->fl_end;
+
+ /* Somewhat ugly workaround for svc lockd.
+ * lockd installs a custom fl_lmops->lm_compare_owner that checks
+ * that the fl_owner is the same (which it always is between lockd
+ * processes on the local node) and then compares the pid.
+ * We therefore assign the pid to the owner field to make it all work;
+ * a conflict with normal locks is unlikely since the pid space and
+ * the pointer space for current->files do not intersect. */
+ if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
+ flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
+
+ switch (file_lock->fl_type) {
+ case F_RDLCK:
+ einfo.ei_mode = LCK_PR;
+ break;
+ case F_UNLCK:
+ /* An unlock request may or may not have any relation to
+ * existing locks so we may not be able to pass a lock handle
+ * via a normal ldlm_lock_cancel() request. The request may even
+ * unlock a byte range in the middle of an existing lock. In
+ * order to process an unlock request we need all of the same
+ * information that is given with a normal read or write record
+ * lock request. To avoid creating another ldlm unlock (cancel)
+ * message we'll treat a LCK_NL flock request as an unlock. */
+ einfo.ei_mode = LCK_NL;
+ break;
+ case F_WRLCK:
+ einfo.ei_mode = LCK_PW;
+ break;
+ default:
+ CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
+ file_lock->fl_type);
+ return -ENOTSUPP;
+ }
+
+ switch (cmd) {
+ case F_SETLKW:
+#ifdef F_SETLKW64
+ case F_SETLKW64:
+#endif
+ flags = 0;
+ break;
+ case F_SETLK:
+#ifdef F_SETLK64
+ case F_SETLK64:
+#endif
+ flags = LDLM_FL_BLOCK_NOWAIT;
+ break;
+ case F_GETLK:
+#ifdef F_GETLK64
+ case F_GETLK64:
+#endif
+ flags = LDLM_FL_TEST_LOCK;
+ /* Save the old mode so that if the mode in the lock changes we
+ * can decrement the appropriate reader or writer refcount. */
+ file_lock->fl_type = einfo.ei_mode;
+ break;
+ default:
+ CERROR("unknown fcntl lock command: %d\n", cmd);
+ return -EINVAL;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
+ inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
+ flock.l_flock.start, flock.l_flock.end);
+
+ rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+ op_data, &lockh, &flock, 0, NULL /* req */, flags);
+
+ if ((file_lock->fl_flags & FL_FLOCK) &&
+ (rc == 0 || file_lock->fl_type == F_UNLCK))
+ rc2 = flock_lock_file_wait(file, file_lock);
+ if ((file_lock->fl_flags & FL_POSIX) &&
+ (rc == 0 || file_lock->fl_type == F_UNLCK) &&
+ !(flags & LDLM_FL_TEST_LOCK))
+ rc2 = posix_lock_file_wait(file, file_lock);
+
+ if (rc2 && file_lock->fl_type != F_UNLCK) {
+ einfo.ei_mode = LCK_NL;
+ md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+ op_data, &lockh, &flock, 0, NULL /* req */, flags);
+ rc = rc2;
+ }
+
+ ll_finish_md_op_data(op_data);
+
+ return rc;
+}
+
+static int
+ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+ return -ENOSYS;
+}
+
+/**
+ * Test if some locks matching bits and l_req_mode are acquired
+ * - bits can be in different locks
+ * - if found, clear the common lock bits in *bits
+ * - the bits not found are kept in *bits
+ * \param inode [IN]
+ * \param bits [IN] searched lock bits
+ * \param l_req_mode [IN] searched lock mode
+ * \retval boolean, true iff all bits are found
+ */
+int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
+{
+ struct lustre_handle lockh;
+ ldlm_policy_data_t policy;
+ ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
+ (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
+ struct lu_fid *fid;
+ __u64 flags;
+ int i;
+
+ if (!inode)
+ return 0;
+
+ fid = &ll_i2info(inode)->lli_fid;
+ CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
+ ldlm_lockname[mode]);
+
+ flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+ for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
+ policy.l_inodebits.bits = *bits & (1 << i);
+ if (policy.l_inodebits.bits == 0)
+ continue;
+
+ if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
+ &policy, mode, &lockh)) {
+ struct ldlm_lock *lock;
+
+ lock = ldlm_handle2lock(&lockh);
+ if (lock) {
+ *bits &=
+ ~(lock->l_policy_data.l_inodebits.bits);
+ LDLM_LOCK_PUT(lock);
+ } else {
+ *bits &= ~policy.l_inodebits.bits;
+ }
+ }
+ }
+ return *bits == 0;
+}
+
+ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+ struct lustre_handle *lockh, __u64 flags,
+ ldlm_mode_t mode)
+{
+ ldlm_policy_data_t policy = { .l_inodebits = {bits} };
+ struct lu_fid *fid;
+ ldlm_mode_t rc;
+
+ fid = &ll_i2info(inode)->lli_fid;
+ CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
+
+ rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
+ fid, LDLM_IBITS, &policy, mode, lockh);
+
+ return rc;
+}
+
+static int ll_inode_revalidate_fini(struct inode *inode, int rc)
+{
+ /* Already unlinked. Just update nlink and return success */
+ if (rc == -ENOENT) {
+ clear_nlink(inode);
+ /* This path cannot be hit for regular files except in the
+ * case of obscure races, so there is no need to validate size.
+ */
+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ return 0;
+ } else if (rc != 0) {
+ CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
+ "%s: revalidate FID "DFID" error: rc = %d\n",
+ ll_get_fsname(inode->i_sb, NULL, 0),
+ PFID(ll_inode2fid(inode)), rc);
+ }
+
+ return rc;
+}
+
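+/*
+ * Revalidate inode attributes from the MDS: use an IT_GETATTR/IT_LOOKUP
+ * intent when the server supports OBD_CONNECT_ATTRFID, otherwise fall back
+ * to a plain md_getattr() if the required ibits lock is not already cached.
+ */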
+static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ptlrpc_request *req = NULL;
+ struct obd_export *exp;
+ int rc = 0;
+
+ LASSERT(inode != NULL);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
+ inode->i_ino, inode->i_generation, inode, dentry);
+
+ exp = ll_i2mdexp(inode);
+
+ /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
+ * But in the CMD case it caused some lock issues, which should be
+ * fixed with the new CMD ibits lock. See bug 12718. */
+ if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
+ struct lookup_intent oit = { .it_op = IT_GETATTR };
+ struct md_op_data *op_data;
+
+ if (ibits == MDS_INODELOCK_LOOKUP)
+ oit.it_op = IT_LOOKUP;
+
+ /* Call getattr by fid, so do not provide name at all. */
+ op_data = ll_prep_md_op_data(NULL, inode,
+ inode, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ oit.it_create_mode |= M_CHECK_STALE;
+ rc = md_intent_lock(exp, op_data, NULL, 0,
+ /* we are not interested in name
+ based lookup */
+ &oit, 0, &req,
+ ll_md_blocking_ast, 0);
+ ll_finish_md_op_data(op_data);
+ oit.it_create_mode &= ~M_CHECK_STALE;
+ if (rc < 0) {
+ rc = ll_inode_revalidate_fini(inode, rc);
+ goto out;
+ }
+
+ rc = ll_revalidate_it_finish(req, &oit, inode);
+ if (rc != 0) {
+ ll_intent_release(&oit);
+ goto out;
+ }
+
+ /* Unlinked? Unhash dentry, so it is not picked up later by
+ do_lookup() -> ll_revalidate_it(). We cannot use d_drop
+ here to preserve get_cwd functionality on 2.6.
+ Bug 10503 */
+ if (!d_inode(dentry)->i_nlink)
+ d_lustre_invalidate(dentry, 0);
+
+ ll_lookup_finish_locks(&oit, inode);
+ } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
+ struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
+ u64 valid = OBD_MD_FLGETATTR;
+ struct md_op_data *op_data;
+ int ealen = 0;
+
+ if (S_ISREG(inode->i_mode)) {
+ rc = ll_get_default_mdsize(sbi, &ealen);
+ if (rc)
+ return rc;
+ valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+ 0, ealen, LUSTRE_OPC_ANY,
+ NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ op_data->op_valid = valid;
+ /* When OBD_CONNECT_ATTRFID is not supported, we can't find a
+ * capa for this inode, because we only keep the capas of
+ * directories fresh. */
+ rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+ ll_finish_md_op_data(op_data);
+ if (rc) {
+ rc = ll_inode_revalidate_fini(inode, rc);
+ return rc;
+ }
+
+ rc = ll_prep_inode(&inode, req, NULL, NULL);
+ }
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
+{
+ struct inode *inode = d_inode(dentry);
+ int rc;
+
+ rc = __ll_inode_revalidate(dentry, ibits);
+ if (rc != 0)
+ return rc;
+
+ /* if object isn't regular file, don't validate size */
+ if (!S_ISREG(inode->i_mode)) {
+ LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
+ LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
+ LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
+ } else {
+ /* In the case of a restore, the MDT has the right size and has
+ * already sent it back without granting the layout lock, so the
+ * inode is up-to-date and a glimpse is useless.
+ * Also, to glimpse we need the layout; while a restore is
+ * running, the MDT holds the layout lock, so the glimpse would
+ * block until the end of the restore (getattr will block)
+ */
+ if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
+ rc = ll_glimpse_size(inode);
+ }
+ return rc;
+}
+
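+/*
+ * getattr: revalidate the inode (LOOKUP and UPDATE ibits) and fill the
+ * kstat from the now up-to-date inode fields.
+ */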
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+ struct inode *inode = d_inode(de);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int res = 0;
+
+ res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
+ MDS_INODELOCK_LOOKUP);
+ ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
+
+ if (res)
+ return res;
+
+ stat->dev = inode->i_sb->s_dev;
+ if (ll_need_32bit_api(sbi))
+ stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
+ else
+ stat->ino = inode->i_ino;
+ stat->mode = inode->i_mode;
+ stat->nlink = inode->i_nlink;
+ stat->uid = inode->i_uid;
+ stat->gid = inode->i_gid;
+ stat->rdev = inode->i_rdev;
+ stat->atime = inode->i_atime;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
+ stat->blksize = 1 << inode->i_blkbits;
+
+ stat->size = i_size_read(inode);
+ stat->blocks = inode->i_blocks;
+
+ return 0;
+}
+
+static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ int rc;
+ size_t num_bytes;
+ struct ll_user_fiemap *fiemap;
+ unsigned int extent_count = fieinfo->fi_extents_max;
+
+ num_bytes = sizeof(*fiemap) + (extent_count *
+ sizeof(struct ll_fiemap_extent));
+ OBD_ALLOC_LARGE(fiemap, num_bytes);
+
+ if (fiemap == NULL)
+ return -ENOMEM;
+
+ fiemap->fm_flags = fieinfo->fi_flags;
+ fiemap->fm_extent_count = fieinfo->fi_extents_max;
+ fiemap->fm_start = start;
+ fiemap->fm_length = len;
+ if (extent_count > 0)
+ memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
+ sizeof(struct ll_fiemap_extent));
+
+ rc = ll_do_fiemap(inode, fiemap, num_bytes);
+
+ fieinfo->fi_flags = fiemap->fm_flags;
+ fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
+ if (extent_count > 0)
+ memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
+ fiemap->fm_mapped_extents *
+ sizeof(struct ll_fiemap_extent));
+
+ OBD_FREE_LARGE(fiemap, num_bytes);
+ return rc;
+}
+
+struct posix_acl *ll_get_acl(struct inode *inode, int type)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct posix_acl *acl = NULL;
+
+ spin_lock(&lli->lli_lock);
+ /* VFS' acl_permission_check->check_acl will release the refcount */
+ acl = posix_acl_dup(lli->lli_posix_acl);
+ spin_unlock(&lli->lli_lock);
+
+ return acl;
+}
+
+int ll_inode_permission(struct inode *inode, int mask)
+{
+ int rc = 0;
+
+#ifdef MAY_NOT_BLOCK
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+#endif
+
+ /* As the root inode is NOT validated during the lookup operation,
+ * we need to do it before the permission check. */
+
+ if (is_root_inode(inode)) {
+ rc = __ll_inode_revalidate(inode->i_sb->s_root,
+ MDS_INODELOCK_LOOKUP);
+ if (rc)
+ return rc;
+ }
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
+ inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
+
+ if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
+ return lustre_check_remote_perm(inode, mask);
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
+ rc = generic_permission(inode, mask);
+
+ return rc;
+}
+
+/* -o localflock - only provides locally consistent flock locks */
+struct file_operations ll_file_operations = {
+ .read_iter = ll_file_read_iter,
+ .write_iter = ll_file_write_iter,
+ .unlocked_ioctl = ll_file_ioctl,
+ .open = ll_file_open,
+ .release = ll_file_release,
+ .mmap = ll_file_mmap,
+ .llseek = ll_file_seek,
+ .splice_read = ll_file_splice_read,
+ .fsync = ll_fsync,
+ .flush = ll_flush
+};
+
+struct file_operations ll_file_operations_flock = {
+ .read_iter = ll_file_read_iter,
+ .write_iter = ll_file_write_iter,
+ .unlocked_ioctl = ll_file_ioctl,
+ .open = ll_file_open,
+ .release = ll_file_release,
+ .mmap = ll_file_mmap,
+ .llseek = ll_file_seek,
+ .splice_read = ll_file_splice_read,
+ .fsync = ll_fsync,
+ .flush = ll_flush,
+ .flock = ll_file_flock,
+ .lock = ll_file_flock
+};
+
+/* These are for -o noflock - to return ENOSYS on flock calls */
+struct file_operations ll_file_operations_noflock = {
+ .read_iter = ll_file_read_iter,
+ .write_iter = ll_file_write_iter,
+ .unlocked_ioctl = ll_file_ioctl,
+ .open = ll_file_open,
+ .release = ll_file_release,
+ .mmap = ll_file_mmap,
+ .llseek = ll_file_seek,
+ .splice_read = ll_file_splice_read,
+ .fsync = ll_fsync,
+ .flush = ll_flush,
+ .flock = ll_file_noflock,
+ .lock = ll_file_noflock
+};
+
+struct inode_operations ll_file_inode_operations = {
+ .setattr = ll_setattr,
+ .getattr = ll_getattr,
+ .permission = ll_inode_permission,
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+ .fiemap = ll_fiemap,
+ .get_acl = ll_get_acl,
+};
+
+/* dynamic ioctl number support routines */
+static struct llioc_ctl_data {
+ struct rw_semaphore ioc_sem;
+ struct list_head ioc_head;
+} llioc = {
+ __RWSEM_INITIALIZER(llioc.ioc_sem),
+ LIST_HEAD_INIT(llioc.ioc_head)
+};
+
+struct llioc_data {
+ struct list_head iocd_list;
+ unsigned int iocd_size;
+ llioc_callback_t iocd_cb;
+ unsigned int iocd_count;
+ unsigned int iocd_cmd[0];
+};
+
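+/*
+ * Register a dynamic ioctl handler: the caller provides a callback and the
+ * list of ioctl commands it understands. ll_iocontrol_call() walks the
+ * registered entries for commands that ll_file_ioctl() does not handle
+ * itself.
+ */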
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
+{
+ unsigned int size;
+ struct llioc_data *in_data = NULL;
+
+ if (cb == NULL || cmd == NULL ||
+ count > LLIOC_MAX_CMD || count < 0)
+ return NULL;
+
+ size = sizeof(*in_data) + count * sizeof(unsigned int);
+ in_data = kzalloc(size, GFP_NOFS);
+ if (!in_data)
+ return NULL;
+
+ in_data->iocd_size = size;
+ in_data->iocd_cb = cb;
+ in_data->iocd_count = count;
+ memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
+
+ down_write(&llioc.ioc_sem);
+ list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
+ up_write(&llioc.ioc_sem);
+
+ return in_data;
+}
+EXPORT_SYMBOL(ll_iocontrol_register);
+
+void ll_iocontrol_unregister(void *magic)
+{
+ struct llioc_data *tmp;
+
+ if (magic == NULL)
+ return;
+
+ down_write(&llioc.ioc_sem);
+ list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
+ if (tmp == magic) {
+ list_del(&tmp->iocd_list);
+ up_write(&llioc.ioc_sem);
+
+ kfree(tmp);
+ return;
+ }
+ }
+ up_write(&llioc.ioc_sem);
+
+ CWARN("didn't find iocontrol register block with magic: %p\n", magic);
+}
+EXPORT_SYMBOL(ll_iocontrol_unregister);
+
+static enum llioc_iter
+ll_iocontrol_call(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg, int *rcp)
+{
+ enum llioc_iter ret = LLIOC_CONT;
+ struct llioc_data *data;
+ int rc = -EINVAL, i;
+
+ down_read(&llioc.ioc_sem);
+ list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
+ for (i = 0; i < data->iocd_count; i++) {
+ if (cmd != data->iocd_cmd[i])
+ continue;
+
+ ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
+ break;
+ }
+
+ if (ret == LLIOC_STOP)
+ break;
+ }
+ up_read(&llioc.ioc_sem);
+
+ if (rcp)
+ *rcp = rc;
+ return ret;
+}
+
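+/*
+ * Push a layout configuration change down to the cl_object of the inode.
+ * For OBJECT_CONF_SET the layout lock is only allowed to match once the
+ * new layout has been applied.
+ */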
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ int result;
+
+ if (lli->lli_clob == NULL)
+ return 0;
+
+ env = cl_env_nested_get(&nest);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ result = cl_conf_set(env, lli->lli_clob, conf);
+ cl_env_nested_put(&nest, env);
+
+ if (conf->coc_opc == OBJECT_CONF_SET) {
+ struct ldlm_lock *lock = conf->coc_lock;
+
+ LASSERT(lock != NULL);
+ LASSERT(ldlm_has_layout(lock));
+ if (result == 0) {
+ /* The lock can only be allowed to match after the layout is
+ * applied to the inode; otherwise a wrong layout would be
+ * seen. Applying the layout should happen before dropping
+ * the intent lock. */
+ ldlm_lock_allow_match(lock);
+ }
+ }
+ return result;
+}
+
+/* Fetch layout from MDT with getxattr request, if it's not ready yet */
+static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct obd_capa *oc;
+ struct ptlrpc_request *req;
+ struct mdt_body *body;
+ void *lvbdata;
+ void *lmm;
+ int lmmsize;
+ int rc;
+
+ CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
+ PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
+ lock->l_lvb_data, lock->l_lvb_len);
+
+ if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
+ return 0;
+
+ /* If the layout lock was granted right away, the layout is returned
+ * within the DLM_LVB of the DLM reply; otherwise, if the lock was
+ * ever blocked and then granted via the completion AST, we have to
+ * fetch the layout here. Note that we can't use the LVB buffer in
+ * the completion AST because it may not be large enough. */
+ oc = ll_mdscapa_get(inode);
+ rc = ll_get_default_mdsize(sbi, &lmmsize);
+ if (rc == 0)
+ rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+ OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
+ lmmsize, 0, &req);
+ capa_put(oc);
+ if (rc < 0)
+ return rc;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ lmmsize = body->eadatasize;
+ if (lmmsize == 0) /* empty layout */ {
+ rc = 0;
+ goto out;
+ }
+
+ lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
+ if (lmm == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ OBD_ALLOC_LARGE(lvbdata, lmmsize);
+ if (lvbdata == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(lvbdata, lmm, lmmsize);
+ lock_res_and_lock(lock);
+ if (lock->l_lvb_data != NULL)
+ OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
+
+ lock->l_lvb_data = lvbdata;
+ lock->l_lvb_len = lmmsize;
+ unlock_res_and_lock(lock);
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+/**
+ * Apply the layout to the inode. Layout lock is held and will be released
+ * in this function.
+ */
+static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
+ struct inode *inode, __u32 *gen, bool reconf)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ldlm_lock *lock;
+ struct lustre_md md = { NULL };
+ struct cl_object_conf conf;
+ int rc = 0;
+ bool lvb_ready;
+ bool wait_layout = false;
+
+ LASSERT(lustre_handle_is_used(lockh));
+
+ lock = ldlm_handle2lock(lockh);
+ LASSERT(lock != NULL);
+ LASSERT(ldlm_has_layout(lock));
+
+ LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
+ inode, PFID(&lli->lli_fid), reconf);
+
+ /* in case this is a cached lock, reinstate it with the new inode */
+ md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
+
+ lock_res_and_lock(lock);
+ lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
+ unlock_res_and_lock(lock);
+ /* Checking lvb_ready is racy, but this is okay. The worst case is
+ * that multiple processes may configure the file at the same time. */
+ if (lvb_ready || !reconf) {
+ rc = -ENODATA;
+ if (lvb_ready) {
+ /* layout_gen must be valid if the layout lock is not
+ * cancelled and the stripe has already been set */
+ *gen = ll_layout_version_get(lli);
+ rc = 0;
+ }
+ goto out;
+ }
+
+ rc = ll_layout_fetch(inode, lock);
+ if (rc < 0)
+ goto out;
+
+ /* for layout lock, lmm is returned in lock's lvb.
+ * lvb_data is immutable if the lock is held so it's safe to access it
+ * without res lock. See the description in ldlm_lock_decref_internal()
+ * for the condition to free lvb_data of layout lock */
+ if (lock->l_lvb_data != NULL) {
+ rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
+ lock->l_lvb_data, lock->l_lvb_len);
+ if (rc >= 0) {
+ *gen = LL_LAYOUT_GEN_EMPTY;
+ if (md.lsm != NULL)
+ *gen = md.lsm->lsm_layout_gen;
+ rc = 0;
+ } else {
+ CERROR("%s: file "DFID" unpackmd error: %d\n",
+ ll_get_fsname(inode->i_sb, NULL, 0),
+ PFID(&lli->lli_fid), rc);
+ }
+ }
+ if (rc < 0)
+ goto out;
+
+ /* Set the layout on the file. This is unlikely to fail as the old
+ * layout has surely been eliminated. */
+ memset(&conf, 0, sizeof(conf));
+ conf.coc_opc = OBJECT_CONF_SET;
+ conf.coc_inode = inode;
+ conf.coc_lock = lock;
+ conf.u.coc_md = &md;
+ rc = ll_layout_conf(inode, &conf);
+
+ if (md.lsm != NULL)
+ obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+
+ /* refresh layout failed, need to wait */
+ wait_layout = rc == -EBUSY;
+
+out:
+ LDLM_LOCK_PUT(lock);
+ ldlm_lock_decref(lockh, mode);
+
+ /* wait for IO to complete if it's still being used. */
+ if (wait_layout) {
+ CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
+ ll_get_fsname(inode->i_sb, NULL, 0),
+ inode, PFID(&lli->lli_fid));
+
+ memset(&conf, 0, sizeof(conf));
+ conf.coc_opc = OBJECT_CONF_WAIT;
+ conf.coc_inode = inode;
+ rc = ll_layout_conf(inode, &conf);
+ if (rc == 0)
+ rc = -EAGAIN;
+
+ CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
+ PFID(&lli->lli_fid), rc);
+ }
+ return rc;
+}
+
+/**
+ * This function checks whether a LAYOUT lock exists on the client side,
+ * or enqueues one if it is not in the cache.
+ *
+ * This function does not hold the layout lock, so it may be revoked any time
+ * after this function returns. Any operations that depend on the layout
+ * should be redone in that case.
+ *
+ * This function should be called before lov_io_init() to get an up-to-date
+ * layout version; the caller should save the version number, and after I/O
+ * is finished, this function should be called again to verify that the
+ * layout has not changed during the I/O.
+ */
+int ll_layout_refresh(struct inode *inode, __u32 *gen)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct md_op_data *op_data;
+ struct lookup_intent it;
+ struct lustre_handle lockh;
+ ldlm_mode_t mode;
+ struct ldlm_enqueue_info einfo = {
+ .ei_type = LDLM_IBITS,
+ .ei_mode = LCK_CR,
+ .ei_cb_bl = ll_md_blocking_ast,
+ .ei_cb_cp = ldlm_completion_ast,
+ };
+ int rc;
+
+ *gen = ll_layout_version_get(lli);
+ if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
+ return 0;
+
+ /* sanity checks */
+ LASSERT(fid_is_sane(ll_inode2fid(inode)));
+ LASSERT(S_ISREG(inode->i_mode));
+
+ /* take layout lock mutex to enqueue layout lock exclusively. */
+ mutex_lock(&lli->lli_layout_mutex);
+
+again:
+ /* the layout lock is usually cached on the local side, so try to
+ * match a cached lock before enqueuing a new one. */
+ mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
+ LCK_CR | LCK_CW | LCK_PR | LCK_PW);
+ if (mode != 0) { /* hit cached lock */
+ rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+ if (rc == -EAGAIN)
+ goto again;
+
+ mutex_unlock(&lli->lli_layout_mutex);
+ return rc;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
+ 0, 0, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data)) {
+ mutex_unlock(&lli->lli_layout_mutex);
+ return PTR_ERR(op_data);
+ }
+
+ /* have to enqueue one */
+ memset(&it, 0, sizeof(it));
+ it.it_op = IT_LAYOUT;
+ lockh.cookie = 0ULL;
+
+ LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
+ ll_get_fsname(inode->i_sb, NULL, 0), inode,
+ PFID(&lli->lli_fid));
+
+ rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
+ NULL, 0, NULL, 0);
+ if (it.d.lustre.it_data != NULL)
+ ptlrpc_req_finished(it.d.lustre.it_data);
+ it.d.lustre.it_data = NULL;
+
+ ll_finish_md_op_data(op_data);
+
+ mode = it.d.lustre.it_lock_mode;
+ it.d.lustre.it_lock_mode = 0;
+ ll_intent_drop_lock(&it);
+
+ if (rc == 0) {
+ /* set lock data in case this is a new lock */
+ ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
+ rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+ if (rc == -EAGAIN)
+ goto again;
+ }
+ mutex_unlock(&lli->lli_layout_mutex);
+
+ return rc;
+}
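+
+/* Illustrative call pattern for the contract described above (a sketch
+ * only, not taken from an actual caller):
+ *
+ *	__u32 gen_before, gen_after;
+ *
+ *	rc = ll_layout_refresh(inode, &gen_before);   // before lov_io_init()
+ *	... perform the IO ...
+ *	rc = ll_layout_refresh(inode, &gen_after);    // after IO completes
+ *	if (gen_before != gen_after)
+ *		... redo the layout-dependent work ...
+ */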
+
+/**
+ * This function sends a restore request to the MDT.
+ */
+int ll_layout_restore(struct inode *inode)
+{
+ struct hsm_user_request *hur;
+ int len, rc;
+
+ len = sizeof(struct hsm_user_request) +
+ sizeof(struct hsm_user_item);
+ hur = kzalloc(len, GFP_NOFS);
+ if (!hur)
+ return -ENOMEM;
+
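+ /* Build a single-item HSM RESTORE request covering the whole file
+ * (an extent length of -1 means up to EOF). */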
+ hur->hur_request.hr_action = HUA_RESTORE;
+ hur->hur_request.hr_archive_id = 0;
+ hur->hur_request.hr_flags = 0;
+ memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
+ sizeof(hur->hur_user_item[0].hui_fid));
+ hur->hur_user_item[0].hui_extent.length = -1;
+ hur->hur_request.hr_itemcount = 1;
+ rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
+ len, hur, NULL);
+ OBD_FREE(hur, len);
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_capa.c b/drivers/staging/lustre/lustre/llite/llite_capa.c
new file mode 100644
index 000000000..aec9a4412
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_capa.c
@@ -0,0 +1,654 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_capa.c
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/file.h>
+#include <linux/kmod.h>
+
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+
+/* for obd_capa.c_list, a client capa may stay in one of three places:
+ * 1. ll_capa_list.
+ * 2. ll_idle_capas.
+ * 3. standalone: just allocated.
+ */
+
+/* capas for OSS writeback and those that failed to renew */
+static LIST_HEAD(ll_idle_capas);
+static struct ptlrpc_thread ll_capa_thread;
+static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
+
+/* llite capa renewal timer */
+struct timer_list ll_capa_timer;
+/* for debug: indicate whether capa on llite is enabled or not */
+static atomic_t ll_capa_debug = ATOMIC_INIT(0);
+static unsigned long long ll_capa_renewed;
+static unsigned long long ll_capa_renewal_noent;
+static unsigned long long ll_capa_renewal_failed;
+static unsigned long long ll_capa_renewal_retries;
+
+static int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
+
+static inline void update_capa_timer(struct obd_capa *ocapa, unsigned long expiry)
+{
+ if (time_before(expiry, ll_capa_timer.expires) ||
+ !timer_pending(&ll_capa_timer)) {
+ mod_timer(&ll_capa_timer, expiry);
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+ "ll_capa_timer update: %lu/%lu by", expiry, jiffies);
+ }
+}
+
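+/* A capa is scheduled for renewal once half of its lifetime has elapsed,
+ * i.e. at c_expiry - lc_timeout / 2, as computed below. */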
+static inline unsigned long capa_renewal_time(struct obd_capa *ocapa)
+{
+ return cfs_time_sub(ocapa->c_expiry,
+ cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2);
+}
+
+static inline int capa_is_to_expire(struct obd_capa *ocapa)
+{
+ return time_before_eq(capa_renewal_time(ocapa), cfs_time_current());
+}
+
+static inline int have_expired_capa(void)
+{
+ struct obd_capa *ocapa = NULL;
+ int expired = 0;
+
+ /* if ll_capa_list has a client capa about to expire, or ll_idle_capas
+ * has an expired capa, return 1.
+ */
+ spin_lock(&capa_lock);
+ if (!list_empty(ll_capa_list)) {
+ ocapa = list_entry(ll_capa_list->next, struct obd_capa,
+ c_list);
+ expired = capa_is_to_expire(ocapa);
+ if (!expired)
+ update_capa_timer(ocapa, capa_renewal_time(ocapa));
+ } else if (!list_empty(&ll_idle_capas)) {
+ ocapa = list_entry(ll_idle_capas.next, struct obd_capa,
+ c_list);
+ expired = capa_is_expired(ocapa);
+ if (!expired)
+ update_capa_timer(ocapa, ocapa->c_expiry);
+ }
+ spin_unlock(&capa_lock);
+
+ if (expired)
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
+ return expired;
+}
+
+static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
+{
+ struct obd_capa *tmp;
+ struct list_head *before = NULL;
+
+ /* TODO: client capas are sorted by expiry, this could be optimized */
+ list_for_each_entry_reverse(tmp, head, c_list) {
+ if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) {
+ before = &tmp->c_list;
+ break;
+ }
+ }
+
+ LASSERT(&ocapa->c_list != before);
+ list_add(&ocapa->c_list, before ?: head);
+}
+
+static inline int obd_capa_open_count(struct obd_capa *oc)
+{
+ struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode);
+
+ return atomic_read(&lli->lli_open_count);
+}
+
+static void ll_delete_capa(struct obd_capa *ocapa)
+{
+ struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
+
+ if (capa_for_mds(&ocapa->c_capa)) {
+ LASSERT(lli->lli_mds_capa == ocapa);
+ lli->lli_mds_capa = NULL;
+ } else if (capa_for_oss(&ocapa->c_capa)) {
+ list_del_init(&ocapa->u.cli.lli_list);
+ }
+
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client");
+ list_del_init(&ocapa->c_list);
+ capa_count[CAPA_SITE_CLIENT]--;
+ /* release the reference taken at allocation time */
+ capa_put(ocapa);
+}
+
+/* three places where a client capa is deleted:
+ * 1. capa_thread_main(), the main place to delete expired capas.
+ * 2. ll_clear_inode_capas() in ll_clear_inode().
+ * 3. ll_truncate_free_capa(), which deletes the truncate capa explicitly in ll_setattr_ost().
+ */
+static int capa_thread_main(void *unused)
+{
+ struct obd_capa *ocapa, *tmp, *next;
+ struct inode *inode = NULL;
+ struct l_wait_info lwi = { 0 };
+ int rc;
+
+ thread_set_flags(&ll_capa_thread, SVC_RUNNING);
+ wake_up(&ll_capa_thread.t_ctl_waitq);
+
+ while (1) {
+ l_wait_event(ll_capa_thread.t_ctl_waitq,
+ !thread_is_running(&ll_capa_thread) ||
+ have_expired_capa(),
+ &lwi);
+
+ if (!thread_is_running(&ll_capa_thread))
+ break;
+
+ next = NULL;
+
+ spin_lock(&capa_lock);
+ list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
+ __u64 ibits;
+
+ LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
+
+ if (!capa_is_to_expire(ocapa)) {
+ next = ocapa;
+ break;
+ }
+
+ list_del_init(&ocapa->c_list);
+
+ /* for an MDS capability, only renew those that belong to a
+ * directory, whose inode is opened, or for which the client
+ * holds a LOOKUP lock.
+ */
+ /* ibits may be changed by ll_have_md_lock() so we have
+ * to set it each time */
+ ibits = MDS_INODELOCK_LOOKUP;
+ if (capa_for_mds(&ocapa->c_capa) &&
+ !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
+ obd_capa_open_count(ocapa) == 0 &&
+ !ll_have_md_lock(ocapa->u.cli.inode,
+ &ibits, LCK_MINMODE)) {
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+ "skip renewal for");
+ sort_add_capa(ocapa, &ll_idle_capas);
+ continue;
+ }
+
+ /* for OSS capability, only renew those whose inode is
+ * opened.
+ */
+ if (capa_for_oss(&ocapa->c_capa) &&
+ obd_capa_open_count(ocapa) == 0) {
+ /* an OSS capa with open count == 0 is not renewed,
+ * move it to the idle list */
+ sort_add_capa(ocapa, &ll_idle_capas);
+ continue;
+ }
+
+ /* NB iput() is in ll_update_capa() */
+ inode = igrab(ocapa->u.cli.inode);
+ if (inode == NULL) {
+ DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+ "igrab failed for");
+ continue;
+ }
+
+ capa_get(ocapa);
+ ll_capa_renewed++;
+ spin_unlock(&capa_lock);
+ rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
+ ll_update_capa);
+ spin_lock(&capa_lock);
+ if (rc) {
+ DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+ "renew failed: %d", rc);
+ ll_capa_renewal_failed++;
+ }
+ }
+
+ if (next)
+ update_capa_timer(next, capa_renewal_time(next));
+
+ list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas,
+ c_list) {
+ if (!capa_is_expired(ocapa)) {
+ if (!next)
+ update_capa_timer(ocapa,
+ ocapa->c_expiry);
+ break;
+ }
+
+ if (atomic_read(&ocapa->c_refc) > 1) {
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+ "expired(c_refc %d), don't release",
+ atomic_read(&ocapa->c_refc));
+ /* don't try to renew any more */
+ list_del_init(&ocapa->c_list);
+ continue;
+ }
+
+ /* expired capa is released. */
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
+ ll_delete_capa(ocapa);
+ }
+
+ spin_unlock(&capa_lock);
+ }
+
+ thread_set_flags(&ll_capa_thread, SVC_STOPPED);
+ wake_up(&ll_capa_thread.t_ctl_waitq);
+ return 0;
+}
+
+void ll_capa_timer_callback(unsigned long unused)
+{
+ wake_up(&ll_capa_thread.t_ctl_waitq);
+}
+
+int ll_capa_thread_start(void)
+{
+ struct task_struct *task;
+
+ init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
+
+ task = kthread_run(capa_thread_main, NULL, "ll_capa");
+ if (IS_ERR(task)) {
+ CERROR("cannot start expired capa thread: rc %ld\n",
+ PTR_ERR(task));
+ return PTR_ERR(task);
+ }
+ wait_event(ll_capa_thread.t_ctl_waitq,
+ thread_is_running(&ll_capa_thread));
+
+ return 0;
+}
+
+void ll_capa_thread_stop(void)
+{
+ thread_set_flags(&ll_capa_thread, SVC_STOPPING);
+ wake_up(&ll_capa_thread.t_ctl_waitq);
+ wait_event(ll_capa_thread.t_ctl_waitq,
+ thread_is_stopped(&ll_capa_thread));
+}
+
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_capa *ocapa;
+ int found = 0;
+
+ if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
+ return NULL;
+
+ LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
+ opc == CAPA_OPC_OSS_TRUNC);
+
+ spin_lock(&capa_lock);
+ list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+ if (capa_is_expired(ocapa))
+ continue;
+ if ((opc & CAPA_OPC_OSS_WRITE) &&
+ capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) {
+ found = 1;
+ break;
+ } else if ((opc & CAPA_OPC_OSS_READ) &&
+ capa_opc_supported(&ocapa->c_capa,
+ CAPA_OPC_OSS_READ)) {
+ found = 1;
+ break;
+ } else if ((opc & CAPA_OPC_OSS_TRUNC) &&
+ capa_opc_supported(&ocapa->c_capa, opc)) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+ ll_inode2fid(inode)));
+ LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+ capa_get(ocapa);
+
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+ } else {
+ ocapa = NULL;
+
+ if (atomic_read(&ll_capa_debug)) {
+ CERROR("no capability for "DFID" opc %#llx\n",
+ PFID(&lli->lli_fid), opc);
+ atomic_set(&ll_capa_debug, 0);
+ }
+ }
+ spin_unlock(&capa_lock);
+
+ return ocapa;
+}
+EXPORT_SYMBOL(ll_osscapa_get);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_capa *ocapa;
+
+ LASSERT(inode != NULL);
+
+ if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
+ return NULL;
+
+ spin_lock(&capa_lock);
+ ocapa = capa_get(lli->lli_mds_capa);
+ spin_unlock(&capa_lock);
+ if (!ocapa && atomic_read(&ll_capa_debug)) {
+ CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid));
+ atomic_set(&ll_capa_debug, 0);
+ }
+
+ return ocapa;
+}
+
+static struct obd_capa *do_add_mds_capa(struct inode *inode,
+ struct obd_capa *ocapa)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_capa *old = lli->lli_mds_capa;
+ struct lustre_capa *capa = &ocapa->c_capa;
+
+ if (!old) {
+ ocapa->u.cli.inode = inode;
+ lli->lli_mds_capa = ocapa;
+ capa_count[CAPA_SITE_CLIENT]++;
+
+ DEBUG_CAPA(D_SEC, capa, "add MDS");
+ } else {
+ spin_lock(&old->c_lock);
+ old->c_capa = *capa;
+ spin_unlock(&old->c_lock);
+
+ DEBUG_CAPA(D_SEC, capa, "update MDS");
+
+ capa_put(ocapa);
+ ocapa = old;
+ }
+ return ocapa;
+}
+
+static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_capa *ocapa;
+
+ /* inside capa_lock */
+ list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+ if ((capa_opc(&ocapa->c_capa) & opc) != opc)
+ continue;
+
+ LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+ ll_inode2fid(inode)));
+ LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+ return ocapa;
+ }
+
+ return NULL;
+}
+
+static inline void inode_add_oss_capa(struct inode *inode,
+ struct obd_capa *ocapa)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_capa *tmp;
+ struct list_head *next = NULL;
+
+ /* capas are sorted in lli_oss_capas so lookup can always find the
+ * latest one */
+ list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) {
+ if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
+ next = &tmp->u.cli.lli_list;
+ break;
+ }
+ }
+ LASSERT(&ocapa->u.cli.lli_list != next);
+ list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas);
+}
+
+static struct obd_capa *do_add_oss_capa(struct inode *inode,
+ struct obd_capa *ocapa)
+{
+ struct obd_capa *old;
+ struct lustre_capa *capa = &ocapa->c_capa;
+
+ LASSERTF(S_ISREG(inode->i_mode),
+ "inode has oss capa, but not regular file, mode: %d\n",
+ inode->i_mode);
+
+ /* FIXME: can't replace it so easily with fine-grained opc */
+ old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY);
+ if (!old) {
+ ocapa->u.cli.inode = inode;
+ INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+ capa_count[CAPA_SITE_CLIENT]++;
+
+ DEBUG_CAPA(D_SEC, capa, "add OSS");
+ } else {
+ spin_lock(&old->c_lock);
+ old->c_capa = *capa;
+ spin_unlock(&old->c_lock);
+
+ DEBUG_CAPA(D_SEC, capa, "update OSS");
+
+ capa_put(ocapa);
+ ocapa = old;
+ }
+
+ inode_add_oss_capa(inode, ocapa);
+ return ocapa;
+}
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
+{
+ spin_lock(&capa_lock);
+ ocapa = capa_for_mds(&ocapa->c_capa) ? do_add_mds_capa(inode, ocapa) :
+ do_add_oss_capa(inode, ocapa);
+
+ /* truncate capa won't renew */
+ if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
+ set_capa_expiry(ocapa);
+ list_del_init(&ocapa->c_list);
+ sort_add_capa(ocapa, ll_capa_list);
+
+ update_capa_timer(ocapa, capa_renewal_time(ocapa));
+ }
+
+ spin_unlock(&capa_lock);
+
+ atomic_set(&ll_capa_debug, 1);
+ return ocapa;
+}
+
+static inline void delay_capa_renew(struct obd_capa *oc, unsigned long delay)
+{
+ /* NB: set a fake expiry for this capa to prevent it from being renewed too soon */
+ oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay));
+}
+
+static int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
+{
+ struct inode *inode = ocapa->u.cli.inode;
+ int rc = 0;
+
+ LASSERT(ocapa);
+
+ if (IS_ERR(capa)) {
+ /* set error code */
+ rc = PTR_ERR(capa);
+ spin_lock(&capa_lock);
+ if (rc == -ENOENT) {
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+ "renewal canceled because object removed");
+ ll_capa_renewal_noent++;
+ } else {
+ ll_capa_renewal_failed++;
+
+ /* a failed capa won't be renewed any longer, but on -EIO the
+ * client might be doing recovery, so retry in 2 min. */
+ if (rc == -EIO && !capa_is_expired(ocapa)) {
+ delay_capa_renew(ocapa, 120);
+ DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+ "renewal failed: -EIO, retry in 2 mins");
+ ll_capa_renewal_retries++;
+ goto retry;
+ } else {
+ DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+ "renewal failed(rc: %d) for", rc);
+ }
+ }
+
+ list_del_init(&ocapa->c_list);
+ sort_add_capa(ocapa, &ll_idle_capas);
+ spin_unlock(&capa_lock);
+
+ capa_put(ocapa);
+ iput(inode);
+ return rc;
+ }
+
+ spin_lock(&ocapa->c_lock);
+ LASSERT(!memcmp(&ocapa->c_capa, capa,
+ offsetof(struct lustre_capa, lc_opc)));
+ ocapa->c_capa = *capa;
+ set_capa_expiry(ocapa);
+ spin_unlock(&ocapa->c_lock);
+
+ spin_lock(&capa_lock);
+ if (capa_for_oss(capa))
+ inode_add_oss_capa(inode, ocapa);
+ DEBUG_CAPA(D_SEC, capa, "renew");
+retry:
+ list_del_init(&ocapa->c_list);
+ sort_add_capa(ocapa, ll_capa_list);
+ update_capa_timer(ocapa, capa_renewal_time(ocapa));
+ spin_unlock(&capa_lock);
+
+ capa_put(ocapa);
+ iput(inode);
+ return rc;
+}
+
+void ll_capa_open(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+ == 0)
+ return;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ atomic_inc(&lli->lli_open_count);
+}
+
+void ll_capa_close(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+ == 0)
+ return;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ atomic_dec(&lli->lli_open_count);
+}
+
+/* delete CAPA_OPC_OSS_TRUNC only */
+void ll_truncate_free_capa(struct obd_capa *ocapa)
+{
+ if (!ocapa)
+ return;
+
+ LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate");
+
+ /* release the reference taken when the capa was found */
+ capa_put(ocapa);
+ if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) {
+ spin_lock(&capa_lock);
+ ll_delete_capa(ocapa);
+ spin_unlock(&capa_lock);
+ }
+}
+
+void ll_clear_inode_capas(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct obd_capa *ocapa, *tmp;
+
+ spin_lock(&capa_lock);
+ ocapa = lli->lli_mds_capa;
+ if (ocapa)
+ ll_delete_capa(ocapa);
+
+ list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas,
+ u.cli.lli_list)
+ ll_delete_capa(ocapa);
+ spin_unlock(&capa_lock);
+}
+
+void ll_print_capa_stat(struct ll_sb_info *sbi)
+{
+ if (sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+ LCONSOLE_INFO("Fid capabilities renewed: %llu\n"
+ "Fid capabilities renewal ENOENT: %llu\n"
+ "Fid capabilities failed to renew: %llu\n"
+ "Fid capabilities renewal retries: %llu\n",
+ ll_capa_renewed, ll_capa_renewal_noent,
+ ll_capa_renewal_failed, ll_capa_renewal_retries);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c
new file mode 100644
index 000000000..a94ba02cc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_close.c
@@ -0,0 +1,393 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_close.c
+ *
+ * Lustre Lite routines to issue a secondary close after writeback
+ */
+
+#include <linux/module.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+
+/** records that a write is in flight */
+void vvp_write_pending(struct ccc_object *club, struct ccc_page *page)
+{
+ struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags |= LLIF_SOM_DIRTY;
+ if (page != NULL && list_empty(&page->cpg_pending_linkage))
+ list_add(&page->cpg_pending_linkage,
+ &club->cob_pending_list);
+ spin_unlock(&lli->lli_lock);
+}
+
+/** records that a write has completed */
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
+{
+ struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+ int rc = 0;
+
+ spin_lock(&lli->lli_lock);
+ if (page != NULL && !list_empty(&page->cpg_pending_linkage)) {
+ list_del_init(&page->cpg_pending_linkage);
+ rc = 1;
+ }
+ spin_unlock(&lli->lli_lock);
+ if (rc)
+ ll_queue_done_writing(club->cob_inode, 0);
+}
+
+/** Queues DONE_WRITING if
+ * - done writing is allowed;
+ * - inode has no dirty pages. */
+void ll_queue_done_writing(struct inode *inode, unsigned long flags)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags |= flags;
+
+ if ((lli->lli_flags & LLIF_DONE_WRITING) &&
+ list_empty(&club->cob_pending_list)) {
+ struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;
+
+ if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+ CWARN("ino %lu/%u(flags %u) som valid it just after recovery\n",
+ inode->i_ino, inode->i_generation,
+ lli->lli_flags);
+ /* DONE_WRITING is allowed and inode has no dirty page. */
+ spin_lock(&lcq->lcq_lock);
+
+ LASSERT(list_empty(&lli->lli_close_list));
+ CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
+ inode->i_ino, inode->i_generation);
+ list_add_tail(&lli->lli_close_list, &lcq->lcq_head);
+
+ /* Avoid a concurrent insertion into the close thread queue:
+ * the inode is already in the close thread, then open(), write()
+ * and close() happen, and the epoch is closed because the inode is
+ * marked LLIF_EPOCH_PENDING. Once its pages are written, the inode
+ * should not be inserted into the queue again, so clear this flag
+ * to avoid it. */
+ lli->lli_flags &= ~LLIF_DONE_WRITING;
+
+ wake_up(&lcq->lcq_waitq);
+ spin_unlock(&lcq->lcq_lock);
+ }
+ spin_unlock(&lli->lli_lock);
+}
+
+/** Pack SOM attributes into @op_data for the CLOSE or DONE_WRITING rpc. */
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ op_data->op_flags |= MF_SOM_CHANGE;
+ /* Check if Size-on-MDS attributes are valid. */
+ if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+ CERROR("ino %lu/%u(flags %u) som valid it just after recovery\n",
+ inode->i_ino, inode->i_generation,
+ lli->lli_flags);
+
+ if (!cl_local_size(inode)) {
+ /* Send Size-on-MDS Attributes if valid. */
+ op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET |
+ ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS;
+ }
+}
+
+/** Closes ioepoch and packs Size-on-MDS attribute if needed into @op_data. */
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+ struct obd_client_handle **och, unsigned long flags)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+
+ spin_lock(&lli->lli_lock);
+ if (!(list_empty(&club->cob_pending_list))) {
+ if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) {
+ LASSERT(*och != NULL);
+ LASSERT(lli->lli_pending_och == NULL);
+ /* Inode is dirty and there is no pending write done
+ * request yet, DONE_WRITE is to be sent later. */
+ lli->lli_flags |= LLIF_EPOCH_PENDING;
+ lli->lli_pending_och = *och;
+ spin_unlock(&lli->lli_lock);
+
+ inode = igrab(inode);
+ LASSERT(inode);
+ goto out;
+ }
+ if (flags & LLIF_DONE_WRITING) {
+ /* Some pages are still dirty, it is too early to send
+ * DONE_WRITE. Wait until all pages have been flushed
+ * and try DONE_WRITE again later. */
+ LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+ lli->lli_flags |= LLIF_DONE_WRITING;
+ spin_unlock(&lli->lli_lock);
+
+ inode = igrab(inode);
+ LASSERT(inode);
+ goto out;
+ }
+ }
+ CDEBUG(D_INODE, "Epoch %llu closed on "DFID"\n",
+ ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid));
+ op_data->op_flags |= MF_EPOCH_CLOSE;
+
+ if (flags & LLIF_DONE_WRITING) {
+ LASSERT(lli->lli_flags & LLIF_SOM_DIRTY);
+ LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+ *och = lli->lli_pending_och;
+ lli->lli_pending_och = NULL;
+ lli->lli_flags &= ~LLIF_EPOCH_PENDING;
+ } else {
+ /* Pack Size-on-MDS inode attributes only if they have changed */
+ if (!(lli->lli_flags & LLIF_SOM_DIRTY)) {
+ spin_unlock(&lli->lli_lock);
+ goto out;
+ }
+
+ /* There is a pending DONE_WRITE -- close epoch with no
+ * attribute change. */
+ if (lli->lli_flags & LLIF_EPOCH_PENDING) {
+ spin_unlock(&lli->lli_lock);
+ goto out;
+ }
+ }
+
+ LASSERT(list_empty(&club->cob_pending_list));
+ lli->lli_flags &= ~LLIF_SOM_DIRTY;
+ spin_unlock(&lli->lli_lock);
+ ll_done_writing_attr(inode, op_data);
+
+out:
+ return;
+}
+
+/**
+ * The client updates SOM attributes on the MDS (including llog cookies):
+ * obd_getattr with no lock and md_setattr.
+ */
+int ll_som_update(struct inode *inode, struct md_op_data *op_data)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ptlrpc_request *request = NULL;
+ __u32 old_flags;
+ struct obdo *oa;
+ int rc;
+
+ LASSERT(op_data != NULL);
+ if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+ CERROR("ino %lu/%u(flags %u) som valid it just after recovery\n",
+ inode->i_ino, inode->i_generation,
+ lli->lli_flags);
+
+ OBDO_ALLOC(oa);
+ if (!oa) {
+ CERROR("can't allocate memory for Size-on-MDS update.\n");
+ return -ENOMEM;
+ }
+
+ old_flags = op_data->op_flags;
+ op_data->op_flags = MF_SOM_CHANGE;
+
+ /* If inode is already in another epoch, skip getattr from OSTs. */
+ if (lli->lli_ioepoch == op_data->op_ioepoch) {
+ rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch,
+ old_flags & MF_GETATTR_LOCK);
+ if (rc) {
+ oa->o_valid = 0;
+ if (rc != -ENOENT)
+ CERROR("inode_getattr failed (%d): unable to send a Size-on-MDS attribute update for inode %lu/%u\n",
+ rc, inode->i_ino,
+ inode->i_generation);
+ } else {
+ CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n",
+ PFID(&lli->lli_fid));
+ }
+ /* Install attributes into op_data. */
+ md_from_obdo(op_data, oa, oa->o_valid);
+ }
+
+ rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data,
+ NULL, 0, NULL, 0, &request, NULL);
+ ptlrpc_req_finished(request);
+
+ OBDO_FREE(oa);
+ return rc;
+}
+
+/**
+ * Closes the ioepoch and packs all the attributes into @op_data for
+ * DONE_WRITING rpc.
+ */
+static void ll_prepare_done_writing(struct inode *inode,
+ struct md_op_data *op_data,
+ struct obd_client_handle **och)
+{
+ ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING);
+ /* If there is no @och, we do not do D_W yet. */
+ if (*och == NULL)
+ return;
+
+ ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh);
+ ll_prep_md_op_data(op_data, inode, NULL, NULL,
+ 0, 0, LUSTRE_OPC_ANY, NULL);
+}
+
+/** Send a DONE_WRITING rpc. */
+static void ll_done_writing(struct inode *inode)
+{
+ struct obd_client_handle *och = NULL;
+ struct md_op_data *op_data;
+ int rc;
+
+ LASSERT(exp_connect_som(ll_i2mdexp(inode)));
+
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data)
+ return;
+
+ ll_prepare_done_writing(inode, op_data, &och);
+ /* If there is no @och, we do not do D_W yet. */
+ if (och == NULL)
+ goto out;
+
+ rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
+ if (rc == -EAGAIN) {
+ /* MDS has instructed us to obtain the Size-on-MDS attribute from
+ * the OSTs and send setattr back to the MDS. */
+ rc = ll_som_update(inode, op_data);
+ } else if (rc) {
+ CERROR("inode %lu mdc done_writing failed: rc = %d\n",
+ inode->i_ino, rc);
+ }
+out:
+ ll_finish_md_op_data(op_data);
+ if (och) {
+ md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
+ OBD_FREE_PTR(och);
+ }
+}
+
+static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq)
+{
+ struct ll_inode_info *lli = NULL;
+
+ spin_lock(&lcq->lcq_lock);
+
+ if (!list_empty(&lcq->lcq_head)) {
+ lli = list_entry(lcq->lcq_head.next, struct ll_inode_info,
+ lli_close_list);
+ list_del_init(&lli->lli_close_list);
+ } else if (atomic_read(&lcq->lcq_stop))
+ lli = ERR_PTR(-EALREADY);
+
+ spin_unlock(&lcq->lcq_lock);
+ return lli;
+}
+
+static int ll_close_thread(void *arg)
+{
+ struct ll_close_queue *lcq = arg;
+
+ complete(&lcq->lcq_comp);
+
+ while (1) {
+ struct l_wait_info lwi = { 0 };
+ struct ll_inode_info *lli;
+ struct inode *inode;
+
+ l_wait_event_exclusive(lcq->lcq_waitq,
+ (lli = ll_close_next_lli(lcq)) != NULL,
+ &lwi);
+ if (IS_ERR(lli))
+ break;
+
+ inode = ll_info2i(lli);
+ CDEBUG(D_INFO, "done_writing for inode %lu/%u\n",
+ inode->i_ino, inode->i_generation);
+ ll_done_writing(inode);
+ iput(inode);
+ }
+
+ CDEBUG(D_INFO, "ll_close exiting\n");
+ complete(&lcq->lcq_comp);
+ return 0;
+}
+
+int ll_close_thread_start(struct ll_close_queue **lcq_ret)
+{
+ struct ll_close_queue *lcq;
+ struct task_struct *task;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD))
+ return -EINTR;
+
+ lcq = kzalloc(sizeof(*lcq), GFP_NOFS);
+ if (!lcq)
+ return -ENOMEM;
+
+ spin_lock_init(&lcq->lcq_lock);
+ INIT_LIST_HEAD(&lcq->lcq_head);
+ init_waitqueue_head(&lcq->lcq_waitq);
+ init_completion(&lcq->lcq_comp);
+
+ task = kthread_run(ll_close_thread, lcq, "ll_close");
+ if (IS_ERR(task)) {
+ OBD_FREE(lcq, sizeof(*lcq));
+ return PTR_ERR(task);
+ }
+
+ wait_for_completion(&lcq->lcq_comp);
+ *lcq_ret = lcq;
+ return 0;
+}
+
+void ll_close_thread_shutdown(struct ll_close_queue *lcq)
+{
+ init_completion(&lcq->lcq_comp);
+ atomic_inc(&lcq->lcq_stop);
+ wake_up(&lcq->lcq_waitq);
+ wait_for_completion(&lcq->lcq_comp);
+ OBD_FREE(lcq, sizeof(*lcq));
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
new file mode 100644
index 000000000..5f918e3c4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -0,0 +1,1521 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LLITE_INTERNAL_H
+#define LLITE_INTERNAL_H
+#include "../include/lustre_debug.h"
+#include "../include/lustre_ver.h"
+#include "../include/lustre_disk.h" /* for s2sbi */
+#include "../include/lustre_eacl.h"
+
+/* for struct cl_lock_descr and struct cl_io */
+#include "../include/cl_object.h"
+#include "../include/lclient.h"
+#include "../include/lustre_mdc.h"
+#include "../include/lustre_intent.h"
+#include <linux/compat.h>
+#include <linux/posix_acl_xattr.h>
+
+#ifndef FMODE_EXEC
+#define FMODE_EXEC 0
+#endif
+
+#ifndef VM_FAULT_RETRY
+#define VM_FAULT_RETRY 0
+#endif
+
+/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it.
+ * see kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */
+#ifndef LOOKUP_CONTINUE
+#define LOOKUP_CONTINUE LOOKUP_PARENT
+#endif
+
+/** Only used on client-side for indicating the tail of dir hash/offset. */
+#define LL_DIR_END_OFF 0x7fffffffffffffffULL
+#define LL_DIR_END_OFF_32BIT 0x7fffffffUL
+
+#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
+#define LUSTRE_FPRIVATE(file) ((file)->private_data)
+
+struct ll_dentry_data {
+ struct lookup_intent *lld_it;
+ unsigned int lld_sa_generation;
+ unsigned int lld_invalid:1;
+ struct rcu_head lld_rcu_head;
+};
+
+#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
+
+#define LLI_INODE_MAGIC 0x111d0de5
+#define LLI_INODE_DEAD 0xdeadd00d
+
+/* remote client permission cache */
+#define REMOTE_PERM_HASHSIZE 16
+
+struct ll_getname_data {
+ struct dir_context ctx;
+ char *lgd_name; /* points to a buffer with NAME_MAX+1 size */
+ struct lu_fid lgd_fid; /* target fid we are looking for */
+ int lgd_found; /* inode matched? */
+};
+
+/* llite setxid/access permission for user on remote client */
+struct ll_remote_perm {
+ struct hlist_node lrp_list;
+ uid_t lrp_uid;
+ gid_t lrp_gid;
+ uid_t lrp_fsuid;
+ gid_t lrp_fsgid;
+ int lrp_access_perm; /* MAY_READ/WRITE/EXEC, this
+ is access permission with
+ lrp_fsuid/lrp_fsgid. */
+};
+
+enum lli_flags {
+ /* MDS has an authority for the Size-on-MDS attributes. */
+ LLIF_MDS_SIZE_LOCK = (1 << 0),
+ /* Epoch close is postponed. */
+ LLIF_EPOCH_PENDING = (1 << 1),
+ /* DONE WRITING is allowed. */
+ LLIF_DONE_WRITING = (1 << 2),
+ /* Size-on-MDS attributes are changed. An attribute update needs to
+ * be sent to MDS. */
+ LLIF_SOM_DIRTY = (1 << 3),
+ /* File data is modified. */
+ LLIF_DATA_MODIFIED = (1 << 4),
+ /* File is being restored */
+ LLIF_FILE_RESTORING = (1 << 5),
+ /* Xattr cache is attached to the file */
+ LLIF_XATTR_CACHE = (1 << 6),
+};
+
+struct ll_inode_info {
+ __u32 lli_inode_magic;
+ __u32 lli_flags;
+ __u64 lli_ioepoch;
+
+ spinlock_t lli_lock;
+ struct posix_acl *lli_posix_acl;
+
+ struct hlist_head *lli_remote_perms;
+ struct mutex lli_rmtperm_mutex;
+
+ /* identifying fields for both metadata and data stacks. */
+ struct lu_fid lli_fid;
+ /* Parent fid for accessing default stripe data on parent directory
+ * for allocating OST objects after a mknod() and later open-by-FID. */
+ struct lu_fid lli_pfid;
+
+ struct list_head lli_close_list;
+ struct list_head lli_oss_capas;
+ /* open count, currently used by capabilities only; indicates whether
+ * the capability needs renewal */
+ atomic_t lli_open_count;
+ struct obd_capa *lli_mds_capa;
+ unsigned long lli_rmtperm_time;
+
+ /* handle is to be sent to MDS later on done_writing and setattr.
+ * Open handle data are needed for the recovery to reconstruct
+ * the inode state on the MDS. XXX: recovery is not ready yet. */
+ struct obd_client_handle *lli_pending_och;
+
+ /* We need all three because every inode may be opened in different
+ * modes */
+ struct obd_client_handle *lli_mds_read_och;
+ struct obd_client_handle *lli_mds_write_och;
+ struct obd_client_handle *lli_mds_exec_och;
+ __u64 lli_open_fd_read_count;
+ __u64 lli_open_fd_write_count;
+ __u64 lli_open_fd_exec_count;
+ /* Protects access to och pointers and their usage counters */
+ struct mutex lli_och_mutex;
+
+ struct inode lli_vfs_inode;
+
+ /* the most recent timestamps obtained from mds */
+ struct ost_lvb lli_lvb;
+ spinlock_t lli_agl_lock;
+
+ /* Try to keep the d::members and f::members aligned. Before using
+ * these members, make sure whether the inode is a directory or not. */
+ union {
+ /* for directory */
+ struct {
+ /* serialize normal readdir and statahead-readdir. */
+ struct mutex d_readdir_mutex;
+
+ /* metadata statahead */
+ /* since parent and child threads can share the same @file
+ * struct, "opendir_key" is the token used at dir close for
+ * the case where the parent exits before the child -- its
+ * holder is the one who should clean up the dir readahead. */
+ void *d_opendir_key;
+ struct ll_statahead_info *d_sai;
+ /* protect statahead stuff. */
+ spinlock_t d_sa_lock;
+ /* "opendir_pid" is the token when lookup/revalid
+ * -- I am the owner of dir statahead. */
+ pid_t d_opendir_pid;
+ } d;
+
+#define lli_readdir_mutex u.d.d_readdir_mutex
+#define lli_opendir_key u.d.d_opendir_key
+#define lli_sai u.d.d_sai
+#define lli_sa_lock u.d.d_sa_lock
+#define lli_opendir_pid u.d.d_opendir_pid
+
+ /* for non-directory */
+ struct {
+ struct mutex f_size_mutex;
+ char *f_symlink_name;
+ __u64 f_maxbytes;
+ /*
+ * struct rw_semaphore {
+ * signed long count; // align d.d_def_acl
+ * spinlock_t wait_lock; // align d.d_sa_lock
+ * struct list_head wait_list;
+ * }
+ */
+ struct rw_semaphore f_trunc_sem;
+ struct mutex f_write_mutex;
+
+ struct rw_semaphore f_glimpse_sem;
+ unsigned long f_glimpse_time;
+ struct list_head f_agl_list;
+ __u64 f_agl_index;
+
+ /* for writepage() only to communicate to fsync */
+ int f_async_rc;
+
+ /*
+ * whenever a process tries to read/write the file, the
+ * jobid of the process is saved here, and it will be
+ * packed into the write RPC when flushed later.
+ *
+ * so the read/write statistics for a jobid will not be
+ * accurate if the file is shared by different jobs.
+ */
+ char f_jobid[JOBSTATS_JOBID_SIZE];
+ } f;
+
+#define lli_size_mutex u.f.f_size_mutex
+#define lli_symlink_name u.f.f_symlink_name
+#define lli_maxbytes u.f.f_maxbytes
+#define lli_trunc_sem u.f.f_trunc_sem
+#define lli_write_mutex u.f.f_write_mutex
+#define lli_glimpse_sem u.f.f_glimpse_sem
+#define lli_glimpse_time u.f.f_glimpse_time
+#define lli_agl_list u.f.f_agl_list
+#define lli_agl_index u.f.f_agl_index
+#define lli_async_rc u.f.f_async_rc
+#define lli_jobid u.f.f_jobid
+
+ } u;
+
+ /* XXX: Although the following frequently used members may only be
+ * meaningful for non-directory objects, it wastes time to check
+ * whether the object is a directory before using them. On the
+ * other hand, since currently sizeof(f) > sizeof(d), moving those
+ * members into u.f would not reduce the "ll_inode_info" size.
+ * So keep them outside.
+ *
+ * In the future, if more members are added only for directories,
+ * some of the following members can be moved into u.f.
+ */
+ bool lli_has_smd;
+ struct cl_object *lli_clob;
+
+ /* mutex to request for layout lock exclusively. */
+ struct mutex lli_layout_mutex;
+ /* Layout version, protected by lli_layout_lock */
+ __u32 lli_layout_gen;
+ spinlock_t lli_layout_lock;
+
+ struct rw_semaphore lli_xattrs_list_rwsem;
+ struct mutex lli_xattrs_enq_lock;
+ struct list_head lli_xattrs;/* ll_xattr_entry->xe_list */
+};
+
+static inline __u32 ll_layout_version_get(struct ll_inode_info *lli)
+{
+ __u32 gen;
+
+ spin_lock(&lli->lli_layout_lock);
+ gen = lli->lli_layout_gen;
+ spin_unlock(&lli->lli_layout_lock);
+
+ return gen;
+}
+
+static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen)
+{
+ spin_lock(&lli->lli_layout_lock);
+ lli->lli_layout_gen = gen;
+ spin_unlock(&lli->lli_layout_lock);
+}
+
+int ll_xattr_cache_destroy(struct inode *inode);
+
+int ll_xattr_cache_get(struct inode *inode,
+ const char *name,
+ char *buffer,
+ size_t size,
+ __u64 valid);
+
+/*
+ * Locking to guarantee consistency of non-atomic updates to long long i_size,
+ * consistency between file size and KMS.
+ *
+ * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order.
+ */
+
+void ll_inode_size_lock(struct inode *inode);
+void ll_inode_size_unlock(struct inode *inode);
+
+/* FIXME: replace the name of this with LL_I to conform to kernel stuff */
+/* static inline struct ll_inode_info *LL_I(struct inode *inode) */
+static inline struct ll_inode_info *ll_i2info(struct inode *inode)
+{
+ return container_of(inode, struct ll_inode_info, lli_vfs_inode);
+}
+
+/* default to about 40 MB of readahead on a given system. That much tied
+ * up in 512 KB readahead requests serviced at 40 ms each is about 1 GB/s. */
+#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
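+/* Rough arithmetic behind the default above (illustrative only):
+ * 40 MB / 512 KB = 80 requests in flight; with each request serviced in
+ * 40 ms, that sustains 80 * 512 KB / 40 ms ~= 1 GB/s of readahead. */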
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+
+enum ra_stat {
+ RA_STAT_HIT = 0,
+ RA_STAT_MISS,
+ RA_STAT_DISTANT_READPAGE,
+ RA_STAT_MISS_IN_WINDOW,
+ RA_STAT_FAILED_GRAB_PAGE,
+ RA_STAT_FAILED_MATCH,
+ RA_STAT_DISCARDED,
+ RA_STAT_ZERO_LEN,
+ RA_STAT_ZERO_WINDOW,
+ RA_STAT_EOF,
+ RA_STAT_MAX_IN_FLIGHT,
+ RA_STAT_WRONG_GRAB_PAGE,
+ _NR_RA_STAT,
+};
+
+struct ll_ra_info {
+ atomic_t ra_cur_pages;
+ unsigned long ra_max_pages;
+ unsigned long ra_max_pages_per_file;
+ unsigned long ra_max_read_ahead_whole_pages;
+};
+
+/* ra_io_arg is filled at the beginning of ll_readahead with ras_lock
+ * held; the following ll_read_ahead_pages then reads RA pages according
+ * to this arg. All the items in this structure are counted in units of
+ * page index.
+ */
+struct ra_io_arg {
+ unsigned long ria_start; /* start offset of read-ahead*/
+ unsigned long ria_end; /* end offset of read-ahead*/
+ /* If a stride read pattern is detected, ria_stoff is where the
+ * stride read starts. Note: for normal read-ahead the value here is
+ * meaningless, and it will not be accessed. */
+ pgoff_t ria_stoff;
+ /* ria_length and ria_pages are the stride length and the number of
+ * data pages per stride in stride I/O mode. They are also used to
+ * check whether the read-ahead pages are stride I/O read-ahead. */
+ unsigned long ria_length;
+ unsigned long ria_pages;
+};
+
+/* LL_HIST_MAX=32 causes an overflow */
+#define LL_HIST_MAX 28
+#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
+#define LL_PROCESS_HIST_MAX 10
+struct per_process_info {
+ pid_t pid;
+ struct obd_histogram pp_r_hist;
+ struct obd_histogram pp_w_hist;
+};
+
+/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */
+struct ll_rw_extents_info {
+ struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1];
+};
+
+#define LL_OFFSET_HIST_MAX 100
+struct ll_rw_process_info {
+ pid_t rw_pid;
+ int rw_op;
+ loff_t rw_range_start;
+ loff_t rw_range_end;
+ loff_t rw_last_file_pos;
+ loff_t rw_offset;
+ size_t rw_smallest_extent;
+ size_t rw_largest_extent;
+ struct ll_file_data *rw_last_file;
+};
+
+enum stats_track_type {
+ STATS_TRACK_ALL = 0, /* track all processes */
+ STATS_TRACK_PID, /* track process with this pid */
+ STATS_TRACK_PPID, /* track processes with this ppid */
+ STATS_TRACK_GID, /* track processes with this gid */
+ STATS_TRACK_LAST,
+};
+
+/* flags for sbi->ll_flags */
+#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */
+#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */
+#define LL_SBI_FLOCK 0x04
+#define LL_SBI_USER_XATTR 0x08 /* support user xattr */
+#define LL_SBI_ACL 0x10 /* support ACL */
+#define LL_SBI_RMT_CLIENT 0x40 /* remote client */
+#define LL_SBI_MDS_CAPA 0x80 /* support mds capa */
+#define LL_SBI_OSS_CAPA 0x100 /* support oss capa */
+#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */
+#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */
+#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */
+#define LL_SBI_SOM_PREVIEW 0x1000 /* SOM preview mount option */
+#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. */
+#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */
+#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */
+#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */
+#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */
+#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */
+#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */
+
+#define LL_SBI_FLAGS { \
+ "nolck", \
+ "checksum", \
+ "flock", \
+ "xattr", \
+ "acl", \
+ "???", \
+ "rmt_client", \
+ "mds_capa", \
+ "oss_capa", \
+ "flock", \
+ "lru_resize", \
+ "lazy_statfs", \
+ "som", \
+ "32bit_api", \
+ "64bit_hash", \
+ "agl", \
+ "verbose", \
+ "layout", \
+ "user_fid2path",\
+ "xattr", \
+}
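+
+/* Note: each entry in LL_SBI_FLAGS names the LL_SBI_* bit at the same
+ * position, in increasing bit order; "???" stands for the unused 0x20 bit. */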
+
+#define RCE_HASHES 32
+
+struct rmtacl_ctl_entry {
+ struct list_head rce_list;
+ pid_t rce_key; /* hash key */
+ int rce_ops; /* acl operation type */
+};
+
+struct rmtacl_ctl_table {
+ spinlock_t rct_lock;
+ struct list_head rct_entries[RCE_HASHES];
+};
+
+#define EE_HASHES 32
+
+struct eacl_table {
+ spinlock_t et_lock;
+ struct list_head et_entries[EE_HASHES];
+};
+
+struct ll_sb_info {
+ struct list_head ll_list;
+ /* this protects pglist and ra_info. It isn't safe to
+ * grab from interrupt contexts */
+ spinlock_t ll_lock;
+ spinlock_t ll_pp_extent_lock; /* pp_extent entry*/
+ spinlock_t ll_process_lock; /* ll_rw_process_info */
+ struct obd_uuid ll_sb_uuid;
+ struct obd_export *ll_md_exp;
+ struct obd_export *ll_dt_exp;
+ struct proc_dir_entry* ll_proc_root;
+ struct lu_fid ll_root_fid; /* root object fid */
+
+ int ll_flags;
+ unsigned int ll_umounting:1,
+ ll_xattr_cache_enabled:1;
+ struct list_head ll_conn_chain; /* per-conn chain of SBs */
+ struct lustre_client_ocd ll_lco;
+
+ struct list_head ll_orphan_dentry_list; /*please don't ask -p*/
+ struct ll_close_queue *ll_lcq;
+
+ struct lprocfs_stats *ll_stats; /* lprocfs stats counter */
+
+ struct cl_client_cache ll_cache;
+
+ struct lprocfs_stats *ll_ra_stats;
+
+ struct ll_ra_info ll_ra_info;
+ unsigned int ll_namelen;
+ struct file_operations *ll_fop;
+
+ /* =0 - hold lock over whole read/write
+ * >0 - max. chunk to be read/written w/o lock re-acquiring */
+ unsigned long ll_max_rw_chunk;
+ unsigned int ll_md_brw_size; /* used by readdir */
+
+ struct lu_site *ll_site;
+ struct cl_device *ll_cl;
+ /* Statistics */
+ struct ll_rw_extents_info ll_rw_extents_info;
+ int ll_extent_process_count;
+ struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX];
+ unsigned int ll_offset_process_count;
+ struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX];
+ unsigned int ll_rw_offset_entry_count;
+ int ll_stats_track_id;
+ enum stats_track_type ll_stats_track_type;
+ int ll_rw_stats_on;
+
+ /* metadata stat-ahead */
+ unsigned int ll_sa_max; /* max statahead RPCs */
+ atomic_t ll_sa_total; /* statahead thread started
+ * count */
+ atomic_t ll_sa_wrong; /* statahead thread stopped for
+ * low hit ratio */
+ atomic_t ll_agl_total; /* AGL thread started count */
+
+ dev_t ll_sdev_orig; /* save s_dev before assign for
+ * clustered nfs */
+ struct rmtacl_ctl_table ll_rct;
+ struct eacl_table ll_et;
+ __kernel_fsid_t ll_fsid;
+};
+
+#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024)
+
+struct ll_ra_read {
+ pgoff_t lrr_start;
+ pgoff_t lrr_count;
+ struct task_struct *lrr_reader;
+ struct list_head lrr_linkage;
+};
+
+/*
+ * per file-descriptor read-ahead data.
+ */
+struct ll_readahead_state {
+ spinlock_t ras_lock;
+ /*
+ * index of the last page that read(2) needed and that wasn't in the
+ * cache. Used by ras_update() to detect seeks.
+ *
+ * XXX nikita: if access seeks into cached region, Lustre doesn't see
+ * this.
+ */
+ unsigned long ras_last_readpage;
+ /*
+ * number of pages read after last read-ahead window reset. As window
+ * is reset on each seek, this is effectively a number of consecutive
+ * accesses. Maybe ->ras_accessed_in_window is a better name.
+ *
+ * XXX nikita: window is also reset (by ras_update()) when Lustre
+ * believes that memory pressure evicts read-ahead pages. In that
+ * case, it probably doesn't make sense to expand window to
+ * PTLRPC_MAX_BRW_PAGES on the third access.
+ */
+ unsigned long ras_consecutive_pages;
+ /*
+ * number of read requests after the last read-ahead window reset.
+ * As the window is reset on each seek, this is effectively the number
+ * of consecutive read requests and is used to trigger read-ahead.
+ */
+ unsigned long ras_consecutive_requests;
+ /*
+ * Parameters of current read-ahead window. Handled by
+ * ras_update(). On the initial access to the file or after a seek,
+ * window is reset to 0. After 3 consecutive accesses, window is
+ * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by
+ * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
+ */
+ unsigned long ras_window_start, ras_window_len;
+ /*
+ * Where the next read-ahead should start. This lies within the read-ahead
+ * window. Read-ahead window is read in pieces rather than at once
+ * because: 1. lustre limits total number of pages under read-ahead by
+ * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages
+ * not covered by DLM lock.
+ */
+ unsigned long ras_next_readahead;
+ /*
+ * Total number of ll_file_read requests issued, reads originating
+ * due to mmap are not counted in this total. This value is used to
+ * trigger full file read-ahead after multiple reads to a small file.
+ */
+ unsigned long ras_requests;
+ /*
+ * Page index with respect to the current request; this value
+ * will not be accurate when dealing with reads issued via mmap.
+ */
+ unsigned long ras_request_index;
+ /*
+ * list of struct ll_ra_read's, one per read(2) call currently in
+ * progress against this file descriptor. Used by read-ahead code,
+ * protected by ->ras_lock.
+ */
+ struct list_head ras_read_beads;
+ /*
+ * The following 3 items are used for detecting the stride I/O
+ * mode.
+ * In stride I/O mode,
+ * ...............|-----data-----|****gap*****|--------|******|....
+ * offset |-stride_pages-|-stride_gap-|
+ * ras_stride_offset = offset;
+ * ras_stride_length = stride_pages + stride_gap;
+ * ras_stride_pages = stride_pages;
+ * Note: all these three items are counted by pages.
+ */
+ unsigned long ras_stride_length;
+ unsigned long ras_stride_pages;
+ pgoff_t ras_stride_offset;
+ /*
+ * count of consecutive stride requests; similar to
+ * ras_consecutive_requests, but used for stride I/O mode.
+ * Note: stride read-ahead is only enabled after more than 2
+ * consecutive stride requests are detected.
+ */
+ unsigned long ras_consecutive_stride_requests;
+};
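+
+/* Worked example of the stride fields above (values are illustrative):
+ * a reader touching 4 pages of data followed by a 12-page gap, starting at
+ * page 100, yields ras_stride_offset = 100, ras_stride_pages = 4 and
+ * ras_stride_length = 4 + 12 = 16. */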
+
+extern struct kmem_cache *ll_file_data_slab;
+struct lustre_handle;
+struct ll_file_data {
+ struct ll_readahead_state fd_ras;
+ struct ccc_grouplock fd_grouplock;
+ __u64 lfd_pos;
+ __u32 fd_flags;
+ fmode_t fd_omode;
+ /* openhandle if lease exists for this file.
+ * Borrow lli->lli_och_mutex to protect assignment */
+ struct obd_client_handle *fd_lease_och;
+ struct obd_client_handle *fd_och;
+ struct file *fd_file;
+ /* Indicates whether a failure needs to be reported on close.
+ * true: the failure is already known, do not report it again.
+ * false: unknown failure, should be reported. */
+ bool fd_write_failed;
+};
+
+struct lov_stripe_md;
+
+extern spinlock_t inode_lock;
+
+extern struct proc_dir_entry *proc_lustre_fs_root;
+
+static inline struct inode *ll_info2i(struct ll_inode_info *lli)
+{
+ return &lli->lli_vfs_inode;
+}
+
+__u32 ll_i2suppgid(struct inode *i);
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2);
+
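+/* True when values returned to userspace must fit in 32 bits: 32-bit
+ * kernels, compat (32-bit) tasks on a 64-bit kernel, or when the
+ * LL_SBI_32BIT_API flag was set via the corresponding mount option. */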
+static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
+{
+#if BITS_PER_LONG == 32
+ return 1;
+#elif defined(CONFIG_COMPAT)
+ return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API));
+#else
+ return unlikely(sbi->ll_flags & LL_SBI_32BIT_API);
+#endif
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
+/* llite/lproc_llite.c */
+#if defined (CONFIG_PROC_FS)
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+ struct super_block *sb, char *osc, char *mdc);
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi);
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+ struct ll_file_data *file, loff_t pos,
+ size_t count, int rw);
+#else
+static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+ struct super_block *sb, char *osc, char *mdc){return 0;}
+static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
+static inline
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {}
+static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+static inline void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+ struct ll_file_data *file, loff_t pos,
+ size_t count, int rw) {}
+#endif
+
+
+/* llite/dir.c */
+void ll_release_page(struct page *page, int remove);
+extern const struct file_operations ll_dir_operations;
+extern const struct inode_operations ll_dir_inode_operations;
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+ struct ll_dir_chain *chain);
+int ll_dir_read(struct inode *inode, struct dir_context *ctx);
+
+int ll_get_mdt_idx(struct inode *inode);
+/* llite/namei.c */
+extern const struct inode_operations ll_special_inode_operations;
+
+int ll_objects_destroy(struct ptlrpc_request *request,
+ struct inode *dir);
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+ struct lustre_md *lic);
+int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+ void *data, int flag);
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de);
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen);
+
+/* llite/rw.c */
+int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page, struct writeback_control *wbc);
+int ll_writepages(struct address_space *, struct writeback_control *wbc);
+int ll_readpage(struct file *file, struct page *page);
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+ struct ll_readahead_state *ras, struct address_space *mapping,
+ struct cl_page_list *queue, int flags);
+
+#ifndef MS_HAS_NEW_AOPS
+extern const struct address_space_operations ll_aops;
+#else
+extern const struct address_space_operations_ext ll_aops;
+#endif
+
+/* llite/file.c */
+extern struct file_operations ll_file_operations;
+extern struct file_operations ll_file_operations_flock;
+extern struct file_operations ll_file_operations_noflock;
+extern struct inode_operations ll_file_inode_operations;
+extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
+ ldlm_mode_t l_req_mode);
+extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+ struct lustre_handle *lockh, __u64 flags,
+ ldlm_mode_t mode);
+int ll_file_open(struct inode *inode, struct file *file);
+int ll_file_release(struct inode *inode, struct file *file);
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+ struct lov_stripe_md *lsm, lstat_t *st);
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
+int ll_release_openhandle(struct inode *, struct lookup_intent *);
+int ll_md_real_close(struct inode *inode, fmode_t fmode);
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+ struct obd_client_handle **och, unsigned long flags);
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data);
+int ll_som_update(struct inode *inode, struct md_op_data *op_data);
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+ __u64 ioepoch, int sync);
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+ struct lustre_handle *fh);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+struct posix_acl *ll_get_acl(struct inode *inode, int type);
+
+int ll_inode_permission(struct inode *inode, int mask);
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
+ int flags, struct lov_user_md *lum,
+ int lum_size);
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+ struct lov_mds_md **lmm, int *lmm_size,
+ struct ptlrpc_request **request);
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+ int set_default);
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+ int *lmm_size, struct ptlrpc_request **request);
+int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
+int ll_fid2path(struct inode *inode, void __user *arg);
+int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
+int ll_hsm_release(struct inode *inode);
+
+/* llite/dcache.c */
+
+int ll_d_init(struct dentry *de);
+extern const struct dentry_operations ll_d_ops;
+void ll_intent_drop_lock(struct lookup_intent *);
+void ll_intent_release(struct lookup_intent *);
+void ll_invalidate_aliases(struct inode *);
+void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode);
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+ struct lookup_intent *it, struct inode *inode);
+
+/* llite/llite_lib.c */
+extern struct super_operations lustre_super_operations;
+
+void ll_lli_init(struct ll_inode_info *lli);
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
+void ll_put_super(struct super_block *sb);
+void ll_kill_super(struct super_block *sb);
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
+void ll_clear_inode(struct inode *inode);
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import);
+int ll_setattr(struct dentry *de, struct iattr *attr);
+int ll_statfs(struct dentry *de, struct kstatfs *sfs);
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+ __u64 max_age, __u32 flags);
+void ll_update_inode(struct inode *inode, struct lustre_md *md);
+void ll_read_inode2(struct inode *inode, void *opaque);
+void ll_delete_inode(struct inode *inode);
+int ll_iocontrol(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg);
+int ll_flush_ctx(struct inode *inode);
+void ll_umount_begin(struct super_block *sb);
+int ll_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_show_options(struct seq_file *seq, struct dentry *dentry);
+void ll_dirty_page_discard_warn(struct page *page, int ioret);
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+ struct super_block *, struct lookup_intent *);
+int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
+int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize);
+int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *max_cookiesize);
+int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *default_cookiesize);
+int ll_process_config(struct lustre_cfg *lcfg);
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+ struct inode *i1, struct inode *i2,
+ const char *name, int namelen,
+ int mode, __u32 opc, void *data);
+void ll_finish_md_op_data(struct md_op_data *op_data);
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg);
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen);
+
+/* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
+__u32 get_uuid2int(const char *name, int len);
+void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid);
+struct inode *search_inode_for_lustre(struct super_block *sb,
+ const struct lu_fid *fid);
+
+/* llite/symlink.c */
+extern struct inode_operations ll_fast_symlink_inode_operations;
+
+/* llite/llite_close.c */
+struct ll_close_queue {
+ spinlock_t lcq_lock;
+ struct list_head lcq_head;
+ wait_queue_head_t lcq_waitq;
+ struct completion lcq_comp;
+ atomic_t lcq_stop;
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+
+void vvp_write_pending (struct ccc_object *club, struct ccc_page *page);
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
+
+/* specific architecture can implement only part of this list */
+enum vvp_io_subtype {
+ /** normal IO */
+ IO_NORMAL,
+ /** io started from splice_{read|write} */
+ IO_SPLICE
+};
+
+/* IO state private to the vvp layer */
+struct vvp_io {
+ /** io subtype */
+ enum vvp_io_subtype cui_io_subtype;
+
+ union {
+ struct {
+ struct pipe_inode_info *cui_pipe;
+ unsigned int cui_flags;
+ } splice;
+ struct vvp_fault_io {
+ /**
+ * Inode modification time that is checked across DLM
+ * lock request.
+ */
+ time_t ft_mtime;
+ struct vm_area_struct *ft_vma;
+ /**
+ * locked page returned from vvp_io
+ */
+ struct page *ft_vmpage;
+ struct vm_fault_api {
+ /**
+ * kernel fault info
+ */
+ struct vm_fault *ft_vmf;
+ /**
+ * bitflags used by the fault API for the return code.
+ */
+ unsigned int ft_flags;
+ /**
+ * check that flags are from filemap_fault
+ */
+ bool ft_flags_valid;
+ } fault;
+ } fault;
+ } u;
+ /**
+ * Read-ahead state used by read and page-fault IO contexts.
+ */
+ struct ll_ra_read cui_bead;
+ /**
+ * Set when cui_bead has been initialized.
+ */
+ int cui_ra_window_set;
+};
+
+/**
+ * IO arguments for various VFS I/O interfaces.
+ */
+struct vvp_io_args {
+ /** normal/splice */
+ enum vvp_io_subtype via_io_subtype;
+
+ union {
+ struct {
+ struct kiocb *via_iocb;
+ struct iov_iter *via_iter;
+ } normal;
+ struct {
+ struct pipe_inode_info *via_pipe;
+ unsigned int via_flags;
+ } splice;
+ } u;
+};
+
+struct ll_cl_context {
+ void *lcc_cookie;
+ struct cl_io *lcc_io;
+ struct cl_page *lcc_page;
+ struct lu_env *lcc_env;
+ int lcc_refcheck;
+};
+
+struct vvp_thread_info {
+ struct vvp_io_args vti_args;
+ struct ra_io_arg vti_ria;
+ struct ll_cl_context vti_io_ctx;
+};
+
+static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
+{
+ extern struct lu_context_key vvp_key;
+ struct vvp_thread_info *info;
+
+ info = lu_context_key_get(&env->le_ctx, &vvp_key);
+ LASSERT(info != NULL);
+ return info;
+}
+
+static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env,
+ enum vvp_io_subtype type)
+{
+ struct vvp_io_args *ret = &vvp_env_info(env)->vti_args;
+
+ ret->via_io_subtype = type;
+
+ return ret;
+}
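+
+/*
+ * A minimal usage sketch (hedged; exact call sites may differ): the
+ * read/write paths fetch per-thread IO arguments from the lu_env rather
+ * than from the stack, e.g.
+ *
+ *     struct vvp_io_args *args = vvp_env_args(env, IO_NORMAL);
+ *
+ *     args->u.normal.via_iter = iter;
+ *     args->u.normal.via_iocb = iocb;
+ */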
+
+struct vvp_session {
+ struct vvp_io vs_ios;
+};
+
+static inline struct vvp_session *vvp_env_session(const struct lu_env *env)
+{
+ extern struct lu_context_key vvp_session_key;
+ struct vvp_session *ses;
+
+ ses = lu_context_key_get(env->le_ses, &vvp_session_key);
+ LASSERT(ses != NULL);
+ return ses;
+}
+
+static inline struct vvp_io *vvp_env_io(const struct lu_env *env)
+{
+ return &vvp_env_session(env)->vs_ios;
+}
+
+int vvp_global_init(void);
+void vvp_global_fini(void);
+
+void ll_queue_done_writing(struct inode *inode, unsigned long flags);
+void ll_close_thread_shutdown(struct ll_close_queue *lcq);
+int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+
+/* llite/llite_mmap.c */
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma);
+void policy_from_vma(ldlm_policy_data_t *policy,
+ struct vm_area_struct *vma, unsigned long addr, size_t count);
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+ size_t count);
+
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+ struct address_space *mapping = vmpage->mapping;
+ loff_t offset = vmpage->index << PAGE_CACHE_SHIFT;
+
+ LASSERT(PageLocked(vmpage));
+ if (mapping == NULL)
+ return;
+
+ ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE);
+ truncate_complete_page(mapping, vmpage);
+}
+
+#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi)
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2dtexp(struct super_block *sb)
+{
+ return ll_s2sbi(sb)->ll_dt_exp;
+}
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2mdexp(struct super_block *sb)
+{
+ return ll_s2sbi(sb)->ll_md_exp;
+}
+
+static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi)
+{
+ struct obd_device *obd = sbi->ll_md_exp->exp_obd;
+ if (obd == NULL)
+ LBUG();
+ return &obd->u.cli;
+}
+
+/* FIXME: replace the name of this with LL_SB to conform to kernel stuff */
+static inline struct ll_sb_info *ll_i2sbi(struct inode *inode)
+{
+ return ll_s2sbi(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2dtexp(struct inode *inode)
+{
+ return ll_s2dtexp(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2mdexp(struct inode *inode)
+{
+ return ll_s2mdexp(inode->i_sb);
+}
+
+static inline struct lu_fid *ll_inode2fid(struct inode *inode)
+{
+ struct lu_fid *fid;
+
+ LASSERT(inode != NULL);
+ fid = &ll_i2info(inode)->lli_fid;
+
+ return fid;
+}
+
+static inline __u64 ll_file_maxbytes(struct inode *inode)
+{
+ return ll_i2info(inode)->lli_maxbytes;
+}
+
+/* llite/xattr.c */
+int ll_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size);
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ll_removexattr(struct dentry *dentry, const char *name);
+
+/* llite/remote_perm.c */
+extern struct kmem_cache *ll_remote_perm_cachep;
+extern struct kmem_cache *ll_rmtperm_hash_cachep;
+
+void free_rmtperm_hash(struct hlist_head *hash);
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
+int lustre_check_remote_perm(struct inode *inode, int mask);
+
+/* llite/llite_capa.c */
+extern struct timer_list ll_capa_timer;
+
+int ll_capa_thread_start(void);
+void ll_capa_thread_stop(void);
+void ll_capa_timer_callback(unsigned long unused);
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
+
+void ll_capa_open(struct inode *inode);
+void ll_capa_close(struct inode *inode);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode);
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc);
+
+void ll_truncate_free_capa(struct obd_capa *ocapa);
+void ll_clear_inode_capas(struct inode *inode);
+void ll_print_capa_stat(struct ll_sb_info *sbi);
+
+/* llite/llite_cl.c */
+extern struct lu_device_type vvp_device_type;
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+int cl_sb_init(struct super_block *sb);
+int cl_sb_fini(struct super_block *sb);
+void ll_io_init(struct cl_io *io, const struct file *file, int write);
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+ struct ll_readahead_state *ras, unsigned long index,
+ unsigned hit);
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
+
+/* llite/llite_rmtacl.c */
+#ifdef CONFIG_FS_POSIX_ACL
+struct eacl_entry {
+ struct list_head ee_list;
+ pid_t ee_key; /* hash key */
+ struct lu_fid ee_fid;
+ int ee_type; /* ACL type for ACCESS or DEFAULT */
+ ext_acl_xattr_header *ee_acl;
+};
+
+u64 rce_ops2valid(int ops);
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key);
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops);
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key);
+void rct_init(struct rmtacl_ctl_table *rct);
+void rct_fini(struct rmtacl_ctl_table *rct);
+
+void ee_free(struct eacl_entry *ee);
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+ ext_acl_xattr_header *header);
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+ struct lu_fid *fid, int type);
+void et_search_free(struct eacl_table *et, pid_t key);
+void et_init(struct eacl_table *et);
+void et_fini(struct eacl_table *et);
+#else
+static inline u64 rce_ops2valid(int ops)
+{
+ return 0;
+}
+#endif
+
+/* statahead.c */
+
+#define LL_SA_RPC_MIN 2
+#define LL_SA_RPC_DEF 32
+#define LL_SA_RPC_MAX 8192
+
+#define LL_SA_CACHE_BIT 5
+#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT)
+#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1)
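+/* 1 << 5 = 32 hash buckets for cached statahead entries */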
+
+/* per inode struct, for dir only */
+struct ll_statahead_info {
+ struct inode *sai_inode;
+ atomic_t sai_refcount; /* hold a refcount while
+ * accessing this struct */
+ unsigned int sai_generation; /* generation for statahead */
+ unsigned int sai_max; /* max ahead of lookup */
+ __u64 sai_sent; /* stat requests sent count */
+ __u64 sai_replied; /* stat requests that received
+ * a reply */
+ __u64 sai_index; /* index of statahead entry */
+ __u64 sai_index_wait; /* index of the entry the
+ * caller is waiting for */
+ __u64 sai_hit; /* hit count */
+ __u64 sai_miss; /* miss count:
+ * for the "ls -al" case, it
+ * includes hidden dentry misses;
+ * for the "ls -l" case, it does
+ * not include hidden dentry
+ * misses. "sai_miss_hidden" is
+ * used for the latter case.
+ */
+ unsigned int sai_consecutive_miss; /* consecutive miss */
+ unsigned int sai_miss_hidden;/* "ls -al", but first dentry
+ * is not a hidden one */
+ unsigned int sai_skip_hidden;/* skipped hidden dentry count */
+ unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for
+ * hidden entries */
+ sai_agl_valid:1;/* AGL is valid for the dir */
+ wait_queue_head_t sai_waitq; /* stat-ahead wait queue */
+ struct ptlrpc_thread sai_thread; /* stat-ahead thread */
+ struct ptlrpc_thread sai_agl_thread; /* AGL thread */
+ struct list_head sai_entries; /* entry list */
+ struct list_head sai_entries_received; /* entries returned */
+ struct list_head sai_entries_stated; /* entries stated */
+ struct list_head sai_entries_agl; /* AGL entries to be sent */
+ struct list_head sai_cache[LL_SA_CACHE_SIZE];
+ spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE];
+ atomic_t sai_cache_count; /* entry count in cache */
+};
+
+int do_statahead_enter(struct inode *dir, struct dentry **dentry,
+ int only_unplug);
+void ll_stop_statahead(struct inode *dir, void *key);
+
+static inline int ll_glimpse_size(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int rc;
+
+ down_read(&lli->lli_glimpse_sem);
+ rc = cl_glimpse_size(inode);
+ lli->lli_glimpse_time = cfs_time_current();
+ up_read(&lli->lli_glimpse_sem);
+ return rc;
+}
+
+static inline void
+ll_statahead_mark(struct inode *dir, struct dentry *dentry)
+{
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct ll_statahead_info *sai = lli->lli_sai;
+ struct ll_dentry_data *ldd = ll_d2d(dentry);
+
+ /* not the same process, don't mark */
+ if (lli->lli_opendir_pid != current_pid())
+ return;
+
+ LASSERT(ldd != NULL);
+ if (sai != NULL)
+ ldd->lld_sa_generation = sai->sai_generation;
+}
+
+static inline int
+d_need_statahead(struct inode *dir, struct dentry *dentryp)
+{
+ struct ll_inode_info *lli;
+ struct ll_dentry_data *ldd;
+
+ if (ll_i2sbi(dir)->ll_sa_max == 0)
+ return -EAGAIN;
+
+ lli = ll_i2info(dir);
+ /* not the same process, don't statahead */
+ if (lli->lli_opendir_pid != current_pid())
+ return -EAGAIN;
+
+ /* statahead has been stopped */
+ if (lli->lli_opendir_key == NULL)
+ return -EAGAIN;
+
+ ldd = ll_d2d(dentryp);
+ /*
+ * When a dentry is stat-ed, the system may trigger "revalidate" or
+ * "lookup" more than once -- for "getattr", for "getxattr", and maybe
+ * for others. Under patchless client mode the operation intent is not
+ * accurate, which can mislead the statahead thread. For example, the
+ * "revalidate" calls for "getattr" and "getxattr" of a dentry may
+ * carry the same operation intent -- "IT_GETATTR".
+ * In fact, a dentry should have only one chance to interact with the
+ * statahead thread, otherwise the statahead window gets confused.
+ * The solution is as follows:
+ * Assign "lld_sa_generation" from "sai_generation" the first time the
+ * dentry is looked up with "IT_GETATTR"; subsequent "IT_GETATTR"
+ * lookups bypass the statahead thread by checking:
+ * "lld_sa_generation == lli_sai->sai_generation"
+ */
+ if (ldd && lli->lli_sai &&
+ ldd->lld_sa_generation == lli->lli_sai->sai_generation)
+ return -EAGAIN;
+
+ return 1;
+}
+
+static inline int
+ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug)
+{
+ int ret;
+
+ ret = d_need_statahead(dir, *dentryp);
+ if (ret <= 0)
+ return ret;
+
+ return do_statahead_enter(dir, dentryp, only_unplug);
+}
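+
+/*
+ * Usage sketch (hedged; the exact call sites are in the lookup/revalidate
+ * paths): a caller passes the parent directory and the dentry being looked
+ * up, e.g.
+ *
+ *     rc = ll_statahead_enter(dir, &dentry, 0);
+ *
+ * A negative return (-EAGAIN from d_need_statahead()) tells the caller to
+ * fall back to a normal lookup; otherwise do_statahead_enter() has been
+ * given a chance to handle the dentry.
+ */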
+
+/* llite ioctl register support routine */
+enum llioc_iter {
+ LLIOC_CONT = 0,
+ LLIOC_STOP
+};
+
+#define LLIOC_MAX_CMD 256
+
+/*
+ * Rules for writing a callback function:
+ *
+ * Parameters:
+ * @magic: The dynamic ioctl call routine will feed this value with the
+ * pointer returned by ll_iocontrol_register. Callback functions should use
+ * this data to check for a potential collision of ioctl cmds. If a
+ * collision is found, the callback function should return LLIOC_CONT.
+ * @rcp: The result of the ioctl command.
+ *
+ * Return values:
+ * If @magic matches the pointer returned by ll_iocontrol_register, the
+ * callback should return LLIOC_STOP; return LLIOC_CONT otherwise.
+ */
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+ struct file *file, unsigned int cmd, unsigned long arg,
+ void *magic, int *rcp);
+
+/* export functions */
+/* Register an ioctl block dynamically for a regular file.
+ *
+ * @cmd: the array of ioctl commands
+ * @count: number of commands in @cmd
+ * @cb: callback function; it will be called if an ioctl command is found to
+ * belong to the command list @cmd.
+ *
+ * Return value:
+ * A magic pointer will be returned on success;
+ * otherwise, NULL will be returned.
+ */
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
+void ll_iocontrol_unregister(void *magic);
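+
+/*
+ * A minimal usage sketch (hypothetical names, not declared anywhere in this
+ * header): a module that handles its own ioctl commands on Lustre files
+ * registers a callback for its command list and keeps the returned magic
+ * cookie for later unregistration:
+ *
+ *     static enum llioc_iter my_cb(struct inode *inode, struct file *file,
+ *                                  unsigned int cmd, unsigned long arg,
+ *                                  void *magic, int *rcp)
+ *     {
+ *             if (magic != my_magic)
+ *                     return LLIOC_CONT;
+ *             *rcp = 0;               handle the command here
+ *             return LLIOC_STOP;
+ *     }
+ *
+ *     static unsigned int my_cmds[] = { MY_IOC_CMD };
+ *
+ *     my_magic = ll_iocontrol_register(my_cb, ARRAY_SIZE(my_cmds), my_cmds);
+ *     ...
+ *     ll_iocontrol_unregister(my_magic);
+ */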
+
+
+/* lclient compat stuff */
+#define cl_inode_info ll_inode_info
+#define cl_i2info(info) ll_i2info(info)
+#define cl_inode_mode(inode) ((inode)->i_mode)
+#define cl_i2sbi ll_i2sbi
+
+static inline struct ll_file_data *cl_iattr2fd(struct inode *inode,
+ const struct iattr *attr)
+{
+ LASSERT(attr->ia_valid & ATTR_FILE);
+ return LUSTRE_FPRIVATE(attr->ia_file);
+}
+
+static inline void cl_isize_lock(struct inode *inode)
+{
+ ll_inode_size_lock(inode);
+}
+
+static inline void cl_isize_unlock(struct inode *inode)
+{
+ ll_inode_size_unlock(inode);
+}
+
+static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
+{
+ LASSERT(mutex_is_locked(&ll_i2info(inode)->lli_size_mutex));
+ i_size_write(inode, kms);
+}
+
+static inline void cl_isize_write(struct inode *inode, loff_t kms)
+{
+ ll_inode_size_lock(inode);
+ i_size_write(inode, kms);
+ ll_inode_size_unlock(inode);
+}
+
+#define cl_isize_read(inode) i_size_read(inode)
+
+static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+ return ll_merge_lvb(env, inode);
+}
+
+#define cl_inode_atime(inode) LTIME_S((inode)->i_atime)
+#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime)
+#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime)
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
+
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+ enum cl_fsync_mode mode, int ignore_layout);
+
+/** direct write pages */
+struct ll_dio_pages {
+ /** page array to be written. We don't support
+ * partial pages except the last one. */
+ struct page **ldp_pages;
+ /** offset of each page */
+ loff_t *ldp_offsets;
+ /** if ldp_offsets is NULL, the pages to be
+ * written are sequential, and this is the file
+ * offset of the first page. */
+ loff_t ldp_start_offset;
+ /** how many bytes are to be written. */
+ size_t ldp_size;
+ /** # of pages in the array. */
+ int ldp_nr;
+};
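+
+/*
+ * A minimal sketch (hypothetical variables): for a sequential direct write
+ * of "nr" whole pages starting at file offset "pos", no per-page offset
+ * array is needed:
+ *
+ *     struct ll_dio_pages pvec = {
+ *             .ldp_pages        = pages,
+ *             .ldp_offsets      = NULL,
+ *             .ldp_start_offset = pos,
+ *             .ldp_size         = nr * PAGE_CACHE_SIZE,
+ *             .ldp_nr           = nr,
+ *     };
+ */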
+
+static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt,
+ int rc)
+{
+ int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ :
+ LPROC_LL_OSC_WRITE;
+
+ ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc);
+}
+
+extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+ int rw, struct inode *inode,
+ struct ll_dio_pages *pv);
+
+static inline int ll_file_nolock(const struct file *file)
+{
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct inode *inode = file_inode(file);
+
+ LASSERT(fd != NULL);
+ return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) ||
+ (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK));
+}
+
+static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode,
+ struct lookup_intent *it, __u64 *bits)
+{
+ if (!it->d.lustre.it_lock_set) {
+ struct lustre_handle handle;
+
+ /* If this inode is a remote object, it will get two
+ * separate locks in different namespaces, Master MDT,
+ * where the name entry is, will grant LOOKUP lock,
+ * remote MDT, where the object is, will grant
+ * UPDATE|PERM lock. The inode will be attached to both
+ * LOOKUP and PERM locks, so revoking either locks will
+ * case the dcache being cleared */
+ if (it->d.lustre.it_remote_lock_mode) {
+ handle.cookie = it->d.lustre.it_remote_lock_handle;
+ CDEBUG(D_DLMTRACE, "setting l_data to inode %p(%lu/%u) for remote lock %#llx\n",
+ inode,
+ inode->i_ino, inode->i_generation,
+ handle.cookie);
+ md_set_lock_data(exp, &handle.cookie, inode, NULL);
+ }
+
+ handle.cookie = it->d.lustre.it_lock_handle;
+
+ CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u) for lock %#llx\n",
+ inode, inode->i_ino,
+ inode->i_generation, handle.cookie);
+
+ md_set_lock_data(exp, &handle.cookie, inode,
+ &it->d.lustre.it_lock_bits);
+ it->d.lustre.it_lock_set = 1;
+ }
+
+ if (bits != NULL)
+ *bits = it->d.lustre.it_lock_bits;
+}
+
+static inline void ll_lock_dcache(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+}
+
+static inline void ll_unlock_dcache(struct inode *inode)
+{
+ spin_unlock(&inode->i_lock);
+}
+
+static inline int d_lustre_invalid(const struct dentry *dentry)
+{
+ struct ll_dentry_data *lld = ll_d2d(dentry);
+
+ return (lld == NULL) || lld->lld_invalid;
+}
+
+static inline void __d_lustre_invalidate(struct dentry *dentry)
+{
+ struct ll_dentry_data *lld = ll_d2d(dentry);
+
+ if (lld != NULL)
+ lld->lld_invalid = 1;
+}
+
+/*
+ * Mark the dentry INVALID. If the dentry refcount is zero (this is normally
+ * the case for ll_md_blocking_ast), unhash the dentry and let the dcache
+ * reclaim it later; otherwise dput() of the last reference will unhash the
+ * dentry and kill it.
+ */
+static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
+{
+ CDEBUG(D_DENTRY, "invalidate dentry %pd (%p) parent %p inode %p refc %d\n",
+ dentry, dentry,
+ dentry->d_parent, d_inode(dentry), d_count(dentry));
+
+ spin_lock_nested(&dentry->d_lock,
+ nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
+ __d_lustre_invalidate(dentry);
+ if (d_count(dentry) == 0)
+ __d_drop(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
+static inline void d_lustre_revalidate(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ LASSERT(ll_d2d(dentry) != NULL);
+ ll_d2d(dentry)->lld_invalid = 0;
+ spin_unlock(&dentry->d_lock);
+}
+
+enum {
+ LL_LAYOUT_GEN_NONE = ((__u32)-2), /* layout lock was cancelled */
+ LL_LAYOUT_GEN_EMPTY = ((__u32)-1) /* for empty layout */
+};
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
+int ll_layout_refresh(struct inode *inode, __u32 *gen);
+int ll_layout_restore(struct inode *inode);
+
+int ll_xattr_init(void);
+void ll_xattr_fini(void);
+
+#endif /* LLITE_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
new file mode 100644
index 000000000..a27af7882
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -0,0 +1,2354 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_lib.c
+ *
+ * Lustre Light Super operations
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/statfs.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include "../include/lustre_lite.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_disk.h"
+#include "../include/lustre_param.h"
+#include "../include/lustre_log.h"
+#include "../include/cl_object.h"
+#include "../include/obd_cksum.h"
+#include "llite_internal.h"
+
+struct kmem_cache *ll_file_data_slab;
+struct proc_dir_entry *proc_lustre_fs_root;
+
+static LIST_HEAD(ll_super_blocks);
+static DEFINE_SPINLOCK(ll_sb_lock);
+
+#ifndef log2
+#define log2(n) ffz(~(n))
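+/* only valid for power-of-two values, e.g. log2(4096) == 12 */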
+#endif
+
+static struct ll_sb_info *ll_init_sbi(void)
+{
+ struct ll_sb_info *sbi = NULL;
+ unsigned long pages;
+ unsigned long lru_page_max;
+ struct sysinfo si;
+ class_uuid_t uuid;
+ int i;
+
+ sbi = kzalloc(sizeof(*sbi), GFP_NOFS);
+ if (!sbi)
+ return NULL;
+
+ spin_lock_init(&sbi->ll_lock);
+ mutex_init(&sbi->ll_lco.lco_lock);
+ spin_lock_init(&sbi->ll_pp_extent_lock);
+ spin_lock_init(&sbi->ll_process_lock);
+ sbi->ll_rw_stats_on = 0;
+
+ si_meminfo(&si);
+ pages = si.totalram - si.totalhigh;
+ if (pages >> (20 - PAGE_CACHE_SHIFT) < 512)
+ lru_page_max = pages / 2;
+ else
+ lru_page_max = (pages / 4) * 3;
+
+ /* initialize lru data */
+ atomic_set(&sbi->ll_cache.ccc_users, 0);
+ sbi->ll_cache.ccc_lru_max = lru_page_max;
+ atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max);
+ spin_lock_init(&sbi->ll_cache.ccc_lru_lock);
+ INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru);
+
+ sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
+ SBI_DEFAULT_READAHEAD_MAX);
+ sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+ SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+ INIT_LIST_HEAD(&sbi->ll_conn_chain);
+ INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
+
+ ll_generate_random_uuid(uuid);
+ class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
+ CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
+
+ spin_lock(&ll_sb_lock);
+ list_add_tail(&sbi->ll_list, &ll_super_blocks);
+ spin_unlock(&ll_sb_lock);
+
+ sbi->ll_flags |= LL_SBI_VERBOSE;
+ sbi->ll_flags |= LL_SBI_CHECKSUM;
+
+ sbi->ll_flags |= LL_SBI_LRU_RESIZE;
+
+ for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+ spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
+ pp_r_hist.oh_lock);
+ spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
+ pp_w_hist.oh_lock);
+ }
+
+ /* metadata statahead is enabled by default */
+ sbi->ll_sa_max = LL_SA_RPC_DEF;
+ atomic_set(&sbi->ll_sa_total, 0);
+ atomic_set(&sbi->ll_sa_wrong, 0);
+ atomic_set(&sbi->ll_agl_total, 0);
+ sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+
+ return sbi;
+}
+
+static void ll_free_sbi(struct super_block *sb)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ if (sbi != NULL) {
+ spin_lock(&ll_sb_lock);
+ list_del(&sbi->ll_list);
+ spin_unlock(&ll_sb_lock);
+ OBD_FREE(sbi, sizeof(*sbi));
+ }
+}
+
+static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
+ struct vfsmount *mnt)
+{
+ struct inode *root = NULL;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct obd_device *obd;
+ struct obd_capa *oc = NULL;
+ struct obd_statfs *osfs = NULL;
+ struct ptlrpc_request *request = NULL;
+ struct obd_connect_data *data = NULL;
+ struct obd_uuid *uuid;
+ struct md_op_data *op_data;
+ struct lustre_md lmd;
+ u64 valid;
+ int size, err, checksum;
+
+ obd = class_name2obd(md);
+ if (!obd) {
+ CERROR("MD %s: not setup or attached\n", md);
+ return -EINVAL;
+ }
+
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+
+ osfs = kzalloc(sizeof(*osfs), GFP_NOFS);
+ if (!osfs) {
+ OBD_FREE_PTR(data);
+ return -ENOMEM;
+ }
+
+ if (proc_lustre_fs_root) {
+ err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
+ dt, md);
+ if (err < 0)
+ CERROR("could not register mount in /proc/fs/lustre\n");
+ }
+
+ /* indicate the features supported by this client */
+ data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH |
+ OBD_CONNECT_ATTRFID |
+ OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE |
+ OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_FID |
+ OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 |
+ OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR |
+ OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH|
+ OBD_CONNECT_EINPROGRESS |
+ OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+ OBD_CONNECT_LAYOUTLOCK |
+ OBD_CONNECT_PINGLESS |
+ OBD_CONNECT_MAX_EASIZE |
+ OBD_CONNECT_FLOCK_DEAD |
+ OBD_CONNECT_DISP_STRIPE;
+
+ if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+ data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+ if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
+ data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+#ifdef CONFIG_FS_POSIX_ACL
+ data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
+#endif
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
+ /* flag the mdc connection as lightweight; only used for test
+ * purposes, use with care */
+ data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
+
+ data->ocd_ibits_known = MDS_INODELOCK_FULL;
+ data->ocd_version = LUSTRE_VERSION_CODE;
+
+ if (sb->s_flags & MS_RDONLY)
+ data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
+ if (sbi->ll_flags & LL_SBI_USER_XATTR)
+ data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+
+#ifdef HAVE_MS_FLOCK_LOCK
+ /* force vfs to use lustre handler for flock() calls - bug 10743 */
+ sb->s_flags |= MS_FLOCK_LOCK;
+#endif
+#ifdef MS_HAS_NEW_AOPS
+ sb->s_flags |= MS_HAS_NEW_AOPS;
+#endif
+
+ if (sbi->ll_flags & LL_SBI_FLOCK)
+ sbi->ll_fop = &ll_file_operations_flock;
+ else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+ sbi->ll_fop = &ll_file_operations;
+ else
+ sbi->ll_fop = &ll_file_operations_noflock;
+
+ /* real client */
+ data->ocd_connect_flags |= OBD_CONNECT_REAL;
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+ data->ocd_brw_size = MD_MAX_BRW_SIZE;
+
+ err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid,
+ data, NULL);
+ if (err == -EBUSY) {
+ LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n",
+ md);
+ goto out;
+ } else if (err) {
+ CERROR("cannot connect to %s: rc = %d\n", md, err);
+ goto out;
+ }
+
+ sbi->ll_md_exp->exp_connect_data = *data;
+
+ err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
+ LUSTRE_SEQ_METADATA);
+ if (err) {
+ CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n",
+ sbi->ll_md_exp->exp_obd->obd_name, err);
+ goto out_md;
+ }
+
+ /* For mount we only need fs info from MDT0; in DNE this also
+ * ensures the client can be mounted as long as MDT0 is
+ * available */
+ err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_FOR_MDT0);
+ if (err)
+ goto out_md_fid;
+
+ /* This needs to be after statfs to ensure connect has finished.
+ * Note that "data" does NOT contain the valid connect reply.
+ * If connecting to a 1.8 server there will be no LMV device, so
+ * we can access the MDC export directly and exp_connect_flags will
+ * be non-zero, but if accessing an upgraded 2.1 server it will
+ * have the correct flags filled in.
+ * XXX: fill in the LMV exp_connect_flags from MDC(s). */
+ valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
+ if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
+ valid != CLIENT_CONNECT_MDT_REQD) {
+ char *buf;
+
+ buf = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ obd_connect_flags2str(buf, PAGE_CACHE_SIZE,
+ valid ^ CLIENT_CONNECT_MDT_REQD, ",");
+ LCONSOLE_ERROR_MSG(0x170, "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n",
+ sbi->ll_md_exp->exp_obd->obd_name, buf);
+ OBD_FREE(buf, PAGE_CACHE_SIZE);
+ err = -EPROTO;
+ goto out_md_fid;
+ }
+
+ size = sizeof(*data);
+ err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
+ KEY_CONN_DATA, &size, data, NULL);
+ if (err) {
+ CERROR("%s: Get connect data failed: rc = %d\n",
+ sbi->ll_md_exp->exp_obd->obd_name, err);
+ goto out_md_fid;
+ }
+
+ LASSERT(osfs->os_bsize);
+ sb->s_blocksize = osfs->os_bsize;
+ sb->s_blocksize_bits = log2(osfs->os_bsize);
+ sb->s_magic = LL_SUPER_MAGIC;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sbi->ll_namelen = osfs->os_namelen;
+ sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK;
+
+ if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
+ !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
+ LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n");
+ sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+ }
+
+ if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+ sb->s_flags |= MS_POSIXACL;
+#endif
+ sbi->ll_flags |= LL_SBI_ACL;
+ } else {
+ LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
+#ifdef MS_POSIXACL
+ sb->s_flags &= ~MS_POSIXACL;
+#endif
+ sbi->ll_flags &= ~LL_SBI_ACL;
+ }
+
+ if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) {
+ if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+ sbi->ll_flags |= LL_SBI_RMT_CLIENT;
+ LCONSOLE_INFO("client is set as remote by default.\n");
+ }
+ } else {
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+ sbi->ll_flags &= ~LL_SBI_RMT_CLIENT;
+ LCONSOLE_INFO("client claims to be remote, but server rejected, forced to be local.\n");
+ }
+ }
+
+ if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) {
+ LCONSOLE_INFO("client enabled MDS capability!\n");
+ sbi->ll_flags |= LL_SBI_MDS_CAPA;
+ }
+
+ if (data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA) {
+ LCONSOLE_INFO("client enabled OSS capability!\n");
+ sbi->ll_flags |= LL_SBI_OSS_CAPA;
+ }
+
+ if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
+ sbi->ll_flags |= LL_SBI_64BIT_HASH;
+
+ if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+ sbi->ll_md_brw_size = data->ocd_brw_size;
+ else
+ sbi->ll_md_brw_size = PAGE_CACHE_SIZE;
+
+ if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) {
+ LCONSOLE_INFO("Layout lock feature supported.\n");
+ sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
+ }
+
+ if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) {
+ if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) {
+ LCONSOLE_INFO(
+ "%s: disabling xattr cache due to unknown maximum xattr size.\n",
+ dt);
+ } else {
+ sbi->ll_flags |= LL_SBI_XATTR_CACHE;
+ sbi->ll_xattr_cache_enabled = 1;
+ }
+ }
+
+ obd = class_name2obd(dt);
+ if (!obd) {
+ CERROR("DT %s: not setup or attached\n", dt);
+ err = -ENODEV;
+ goto out_md_fid;
+ }
+
+ data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION |
+ OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_FID |
+ OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK|
+ OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT |
+ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR|
+ OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH |
+ OBD_CONNECT_MAXBYTES |
+ OBD_CONNECT_EINPROGRESS |
+ OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+ if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+ data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+ if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
+ /* OBD_CONNECT_CKSUM should always be set, even if checksums are
+ * disabled by default, because it can still be enabled on the
+ * fly via /proc. As a consequence, we still need to come to an
+ * agreement on the supported algorithms at connect time */
+ data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
+ data->ocd_cksum_types = OBD_CKSUM_ADLER;
+ else
+ data->ocd_cksum_types = cksum_types_supported_client();
+ }
+
+ data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+ CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n",
+ data->ocd_connect_flags,
+ data->ocd_version, data->ocd_grant);
+
+ obd->obd_upcall.onu_owner = &sbi->ll_lco;
+ obd->obd_upcall.onu_upcall = cl_ocd_update;
+
+ data->ocd_brw_size = DT_MAX_BRW_SIZE;
+
+ err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
+ NULL);
+ if (err == -EBUSY) {
+ LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n",
+ dt);
+ goto out_md;
+ } else if (err) {
+ CERROR("%s: Cannot connect to %s: rc = %d\n",
+ sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
+ goto out_md;
+ }
+
+ sbi->ll_dt_exp->exp_connect_data = *data;
+
+ err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
+ LUSTRE_SEQ_METADATA);
+ if (err) {
+ CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n",
+ sbi->ll_dt_exp->exp_obd->obd_name, err);
+ goto out_dt;
+ }
+
+ mutex_lock(&sbi->ll_lco.lco_lock);
+ sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+ sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
+ sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
+ mutex_unlock(&sbi->ll_lco.lco_lock);
+
+ fid_zero(&sbi->ll_root_fid);
+ err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc);
+ if (err) {
+ CERROR("cannot mds_connect: rc = %d\n", err);
+ goto out_lock_cn_cb;
+ }
+ if (!fid_is_sane(&sbi->ll_root_fid)) {
+ CERROR("%s: Invalid root fid "DFID" during mount\n",
+ sbi->ll_md_exp->exp_obd->obd_name,
+ PFID(&sbi->ll_root_fid));
+ err = -EINVAL;
+ goto out_lock_cn_cb;
+ }
+ CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
+
+ sb->s_op = &lustre_super_operations;
+#if THREAD_SIZE >= 8192 /*b=17630*/
+ sb->s_export_op = &lustre_export_operations;
+#endif
+
+ /* make root inode
+ * XXX: move this to after cbd setup? */
+ valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA;
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ valid |= OBD_MD_FLRMTPERM;
+ else if (sbi->ll_flags & LL_SBI_ACL)
+ valid |= OBD_MD_FLACL;
+
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data) {
+ err = -ENOMEM;
+ goto out_lock_cn_cb;
+ }
+
+ op_data->op_fid1 = sbi->ll_root_fid;
+ op_data->op_mode = 0;
+ op_data->op_capa1 = oc;
+ op_data->op_valid = valid;
+
+ err = md_getattr(sbi->ll_md_exp, op_data, &request);
+ if (oc)
+ capa_put(oc);
+ OBD_FREE_PTR(op_data);
+ if (err) {
+ CERROR("%s: md_getattr failed for root: rc = %d\n",
+ sbi->ll_md_exp->exp_obd->obd_name, err);
+ goto out_lock_cn_cb;
+ }
+
+ err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+ sbi->ll_md_exp, &lmd);
+ if (err) {
+ CERROR("failed to understand root inode md: rc = %d\n", err);
+ ptlrpc_req_finished(request);
+ goto out_lock_cn_cb;
+ }
+
+ LASSERT(fid_is_sane(&sbi->ll_root_fid));
+ root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
+ sbi->ll_flags & LL_SBI_32BIT_API),
+ &lmd);
+ md_free_lustre_md(sbi->ll_md_exp, &lmd);
+ ptlrpc_req_finished(request);
+
+ if (root == NULL || IS_ERR(root)) {
+ if (lmd.lsm)
+ obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
+#ifdef CONFIG_FS_POSIX_ACL
+ if (lmd.posix_acl) {
+ posix_acl_release(lmd.posix_acl);
+ lmd.posix_acl = NULL;
+ }
+#endif
+ err = IS_ERR(root) ? PTR_ERR(root) : -EBADF;
+ root = NULL;
+ CERROR("lustre_lite: bad iget4 for root\n");
+ goto out_root;
+ }
+
+ err = ll_close_thread_start(&sbi->ll_lcq);
+ if (err) {
+ CERROR("cannot start close thread: rc %d\n", err);
+ goto out_root;
+ }
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+ rct_init(&sbi->ll_rct);
+ et_init(&sbi->ll_et);
+ }
+#endif
+
+ checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
+ err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+ KEY_CHECKSUM, sizeof(checksum), &checksum,
+ NULL);
+ cl_sb_init(sb);
+
+ err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
+ KEY_CACHE_SET, sizeof(sbi->ll_cache),
+ &sbi->ll_cache, NULL);
+
+ sb->s_root = d_make_root(root);
+ if (sb->s_root == NULL) {
+ CERROR("%s: can't make root dentry\n",
+ ll_get_fsname(sb, NULL, 0));
+ err = -ENOMEM;
+ goto out_lock_cn_cb;
+ }
+
+ sbi->ll_sdev_orig = sb->s_dev;
+
+ /* We set sb->s_dev equal on all lustre clients in order to support
+ * NFS export clustering. NFSD requires that the FSID be the same
+ * on all clients. */
+ /* s_dev is also used in lt_compare() to compare two fs, but that is
+ * only a node-local comparison. */
+ uuid = obd_get_uuid(sbi->ll_md_exp);
+ if (uuid != NULL) {
+ sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
+ get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid);
+ }
+
+ if (data != NULL)
+ OBD_FREE_PTR(data);
+ if (osfs != NULL)
+ OBD_FREE_PTR(osfs);
+
+ return err;
+out_root:
+ iput(root);
+out_lock_cn_cb:
+ obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+out_dt:
+ obd_disconnect(sbi->ll_dt_exp);
+ sbi->ll_dt_exp = NULL;
+ /* Make sure all OSCs are gone, since cl_cache is accessing sbi. */
+ obd_zombie_barrier();
+out_md_fid:
+ obd_fid_fini(sbi->ll_md_exp->exp_obd);
+out_md:
+ obd_disconnect(sbi->ll_md_exp);
+ sbi->ll_md_exp = NULL;
+out:
+ if (data != NULL)
+ OBD_FREE_PTR(data);
+ if (osfs != NULL)
+ OBD_FREE_PTR(osfs);
+ lprocfs_unregister_mountpoint(sbi);
+ return err;
+}
+
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+ int size, rc;
+
+ *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
+ size = sizeof(int);
+ rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
+ KEY_MAX_EASIZE, &size, lmmsize, NULL);
+ if (rc)
+ CERROR("Get max mdsize error rc %d\n", rc);
+
+ return rc;
+}
+
+int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+ int size, rc;
+
+ size = sizeof(int);
+ rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE),
+ KEY_DEFAULT_EASIZE, &size, lmmsize, NULL);
+ if (rc)
+ CERROR("Get default mdsize error rc %d\n", rc);
+
+ return rc;
+}
+
+int ll_get_max_cookiesize(struct ll_sb_info *sbi, int *lmmsize)
+{
+ int size, rc;
+
+ size = sizeof(int);
+ rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_COOKIESIZE),
+ KEY_MAX_COOKIESIZE, &size, lmmsize, NULL);
+ if (rc)
+ CERROR("Get max cookiesize error rc %d\n", rc);
+
+ return rc;
+}
+
+int ll_get_default_cookiesize(struct ll_sb_info *sbi, int *lmmsize)
+{
+ int size, rc;
+
+ size = sizeof(int);
+ rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_COOKIESIZE),
+ KEY_DEFAULT_COOKIESIZE, &size, lmmsize, NULL);
+ if (rc)
+ CERROR("Get default cookiesize error rc %d\n", rc);
+
+ return rc;
+}
+
+static void client_common_put_super(struct super_block *sb)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+ et_fini(&sbi->ll_et);
+ rct_fini(&sbi->ll_rct);
+ }
+#endif
+
+ ll_close_thread_shutdown(sbi->ll_lcq);
+
+ cl_sb_fini(sb);
+
+ list_del(&sbi->ll_conn_chain);
+
+ obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+ obd_disconnect(sbi->ll_dt_exp);
+ sbi->ll_dt_exp = NULL;
+ /* wait till all OSCs are gone, since cl_cache is accessing sbi.
+ * see LU-2543. */
+ obd_zombie_barrier();
+
+ lprocfs_unregister_mountpoint(sbi);
+
+ obd_fid_fini(sbi->ll_md_exp->exp_obd);
+ obd_disconnect(sbi->ll_md_exp);
+ sbi->ll_md_exp = NULL;
+}
+
+void ll_kill_super(struct super_block *sb)
+{
+ struct ll_sb_info *sbi;
+
+ /* sb not initialized? */
+ if (!(sb->s_flags & MS_ACTIVE))
+ return;
+
+ sbi = ll_s2sbi(sb);
+ /* we need to restore the s_dev that was changed for clustered NFS
+ * before put_super, because newer kernels cache s_dev, and changing
+ * sb->s_dev in put_super would not affect the real device removal */
+ if (sbi) {
+ sb->s_dev = sbi->ll_sdev_orig;
+ sbi->ll_umounting = 1;
+ }
+}
+
+static inline int ll_set_opt(const char *opt, char *data, int fl)
+{
+ if (strncmp(opt, data, strlen(opt)) != 0)
+ return 0;
+ else
+ return fl;
+}
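+/* e.g. ll_set_opt("flock", "flock,user_xattr", LL_SBI_FLOCK) returns LL_SBI_FLOCK */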
+
+/* non-client-specific mount options are parsed in lmd_parse */
+static int ll_options(char *options, int *flags)
+{
+ int tmp;
+ char *s1 = options, *s2;
+
+ if (!options)
+ return 0;
+
+ CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
+
+ while (*s1) {
+ CDEBUG(D_SUPER, "next opt=%s\n", s1);
+ tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+
+ tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE);
+ if (tmp) {
+ *flags |= tmp;
+ goto next;
+ }
+ tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE);
+ if (tmp) {
+ *flags &= ~tmp;
+ goto next;
+ }
+ LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
+ s1);
+ return -EINVAL;
+
+next:
+ /* Find next opt */
+ s2 = strchr(s1, ',');
+ if (s2 == NULL)
+ break;
+ s1 = s2 + 1;
+ }
+ return 0;
+}
+
+void ll_lli_init(struct ll_inode_info *lli)
+{
+ lli->lli_inode_magic = LLI_INODE_MAGIC;
+ lli->lli_flags = 0;
+ lli->lli_ioepoch = 0;
+ lli->lli_maxbytes = MAX_LFS_FILESIZE;
+ spin_lock_init(&lli->lli_lock);
+ lli->lli_posix_acl = NULL;
+ lli->lli_remote_perms = NULL;
+ mutex_init(&lli->lli_rmtperm_mutex);
+ /* Do not set lli_fid, it has been initialized already. */
+ fid_zero(&lli->lli_pfid);
+ INIT_LIST_HEAD(&lli->lli_close_list);
+ INIT_LIST_HEAD(&lli->lli_oss_capas);
+ atomic_set(&lli->lli_open_count, 0);
+ lli->lli_mds_capa = NULL;
+ lli->lli_rmtperm_time = 0;
+ lli->lli_pending_och = NULL;
+ lli->lli_mds_read_och = NULL;
+ lli->lli_mds_write_och = NULL;
+ lli->lli_mds_exec_och = NULL;
+ lli->lli_open_fd_read_count = 0;
+ lli->lli_open_fd_write_count = 0;
+ lli->lli_open_fd_exec_count = 0;
+ mutex_init(&lli->lli_och_mutex);
+ spin_lock_init(&lli->lli_agl_lock);
+ lli->lli_has_smd = false;
+ spin_lock_init(&lli->lli_layout_lock);
+ ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE);
+ lli->lli_clob = NULL;
+
+ init_rwsem(&lli->lli_xattrs_list_rwsem);
+ mutex_init(&lli->lli_xattrs_enq_lock);
+
+ LASSERT(lli->lli_vfs_inode.i_mode != 0);
+ if (S_ISDIR(lli->lli_vfs_inode.i_mode)) {
+ mutex_init(&lli->lli_readdir_mutex);
+ lli->lli_opendir_key = NULL;
+ lli->lli_sai = NULL;
+ spin_lock_init(&lli->lli_sa_lock);
+ lli->lli_opendir_pid = 0;
+ } else {
+ mutex_init(&lli->lli_size_mutex);
+ lli->lli_symlink_name = NULL;
+ init_rwsem(&lli->lli_trunc_sem);
+ mutex_init(&lli->lli_write_mutex);
+ init_rwsem(&lli->lli_glimpse_sem);
+ lli->lli_glimpse_time = 0;
+ INIT_LIST_HEAD(&lli->lli_agl_list);
+ lli->lli_agl_index = 0;
+ lli->lli_async_rc = 0;
+ }
+ mutex_init(&lli->lli_layout_mutex);
+}
+
+static inline int ll_bdi_register(struct backing_dev_info *bdi)
+{
+ static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+
+ bdi->name = "lustre";
+ return bdi_register(bdi, NULL, "lustre-%d",
+ atomic_inc_return(&ll_bdi_num));
+}
+
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
+{
+ struct lustre_profile *lprof = NULL;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct ll_sb_info *sbi;
+ char *dt = NULL, *md = NULL;
+ char *profilenm = get_profile_name(sb);
+ struct config_llog_instance *cfg;
+ /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */
+ const int instlen = sizeof(cfg->cfg_instance) * 2 + 2;
+ int err;
+
+ CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+ cfg = kzalloc(sizeof(*cfg), GFP_NOFS);
+ if (!cfg)
+ return -ENOMEM;
+
+ try_module_get(THIS_MODULE);
+
+ /* client additional sb info */
+ lsi->lsi_llsbi = sbi = ll_init_sbi();
+ if (!sbi) {
+ module_put(THIS_MODULE);
+ OBD_FREE_PTR(cfg);
+ return -ENOMEM;
+ }
+
+ err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
+ if (err)
+ goto out_free;
+
+ err = bdi_init(&lsi->lsi_bdi);
+ if (err)
+ goto out_free;
+ lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+ lsi->lsi_bdi.capabilities = 0;
+ err = ll_bdi_register(&lsi->lsi_bdi);
+ if (err)
+ goto out_free;
+
+ sb->s_bdi = &lsi->lsi_bdi;
+ /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
+ sb->s_d_op = &ll_d_ops;
+
+ /* Generate a string unique to this super, in case some joker tries
+ to mount the same fs at two mount points.
+ Use the address of the super itself.*/
+ cfg->cfg_instance = sb;
+ cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+ cfg->cfg_callback = class_config_llog_handler;
+ /* set up client obds */
+ err = lustre_process_log(sb, profilenm, cfg);
+ if (err < 0) {
+ CERROR("Unable to process log: %d\n", err);
+ goto out_free;
+ }
+
+ /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
+ lprof = class_get_profile(profilenm);
+ if (lprof == NULL) {
+ LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be read from the MGS. Does that filesystem exist?\n",
+ profilenm);
+ err = -EINVAL;
+ goto out_free;
+ }
+ CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
+ lprof->lp_md, lprof->lp_dt);
+
+ dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
+ if (!dt) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance);
+ if (!md) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ /* connections, registrations, sb setup */
+ err = client_common_fill_super(sb, md, dt, mnt);
+
+out_free:
+ if (md)
+ OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2);
+ if (dt)
+ OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2);
+ if (err)
+ ll_put_super(sb);
+ else if (sbi->ll_flags & LL_SBI_VERBOSE)
+ LCONSOLE_WARN("Mounted %s\n", profilenm);
+
+ OBD_FREE_PTR(cfg);
+ return err;
+} /* ll_fill_super */
+
+void ll_put_super(struct super_block *sb)
+{
+ struct config_llog_instance cfg, params_cfg;
+ struct obd_device *obd;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ char *profilenm = get_profile_name(sb);
+ int next, force = 1;
+
+ CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
+
+ ll_print_capa_stat(sbi);
+
+ cfg.cfg_instance = sb;
+ lustre_end_log(sb, profilenm, &cfg);
+
+ params_cfg.cfg_instance = sb;
+ lustre_end_log(sb, PARAMS_FILENAME, &params_cfg);
+
+ if (sbi->ll_md_exp) {
+ obd = class_exp2obd(sbi->ll_md_exp);
+ if (obd)
+ force = obd->obd_force;
+ }
+
+ /* We need to set force before the lov_disconnect in
+ lustre_common_put_super, since lov_disconnect cleans up the OSCs as well. */
+ if (force) {
+ next = 0;
+ while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
+ &next)) != NULL) {
+ obd->obd_force = force;
+ }
+ }
+
+ if (sbi->ll_lcq) {
+ /* Only if client_common_fill_super succeeded */
+ client_common_put_super(sb);
+ }
+
+ next = 0;
+ while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)))
+ class_manual_cleanup(obd);
+
+ if (sbi->ll_flags & LL_SBI_VERBOSE)
+ LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
+
+ if (profilenm)
+ class_del_profile(profilenm);
+
+ if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
+ bdi_destroy(&lsi->lsi_bdi);
+ lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
+ }
+
+ ll_free_sbi(sb);
+ lsi->lsi_llsbi = NULL;
+
+ lustre_common_put_super(sb);
+
+ module_put(THIS_MODULE);
+} /* client_put_super */
+
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
+{
+ struct inode *inode = NULL;
+
+ /* NOTE: we depend on atomic igrab() -bzzz */
+ lock_res_and_lock(lock);
+ if (lock->l_resource->lr_lvb_inode) {
+ struct ll_inode_info *lli;
+
+ lli = ll_i2info(lock->l_resource->lr_lvb_inode);
+ if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+ inode = igrab(lock->l_resource->lr_lvb_inode);
+ } else {
+ inode = lock->l_resource->lr_lvb_inode;
+ LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO :
+ D_WARNING, lock, "lr_lvb_inode %p is bogus: magic %08x",
+ lock->l_resource->lr_lvb_inode,
+ lli->lli_inode_magic);
+ inode = NULL;
+ }
+ }
+ unlock_res_and_lock(lock);
+ return inode;
+}
+
+void ll_clear_inode(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+ inode->i_generation, inode);
+
+ if (S_ISDIR(inode->i_mode)) {
+ /* these should have been cleared in ll_file_release */
+ LASSERT(lli->lli_opendir_key == NULL);
+ LASSERT(lli->lli_sai == NULL);
+ LASSERT(lli->lli_opendir_pid == 0);
+ }
+
+ spin_lock(&lli->lli_lock);
+ ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+ spin_unlock(&lli->lli_lock);
+ md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
+
+ LASSERT(!lli->lli_open_fd_write_count);
+ LASSERT(!lli->lli_open_fd_read_count);
+ LASSERT(!lli->lli_open_fd_exec_count);
+
+ if (lli->lli_mds_write_och)
+ ll_md_real_close(inode, FMODE_WRITE);
+ if (lli->lli_mds_exec_och)
+ ll_md_real_close(inode, FMODE_EXEC);
+ if (lli->lli_mds_read_och)
+ ll_md_real_close(inode, FMODE_READ);
+
+ if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) {
+ OBD_FREE(lli->lli_symlink_name,
+ strlen(lli->lli_symlink_name) + 1);
+ lli->lli_symlink_name = NULL;
+ }
+
+ ll_xattr_cache_destroy(inode);
+
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+ LASSERT(lli->lli_posix_acl == NULL);
+ if (lli->lli_remote_perms) {
+ free_rmtperm_hash(lli->lli_remote_perms);
+ lli->lli_remote_perms = NULL;
+ }
+ }
+#ifdef CONFIG_FS_POSIX_ACL
+ else if (lli->lli_posix_acl) {
+ LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
+ LASSERT(lli->lli_remote_perms == NULL);
+ posix_acl_release(lli->lli_posix_acl);
+ lli->lli_posix_acl = NULL;
+ }
+#endif
+ lli->lli_inode_magic = LLI_INODE_DEAD;
+
+ ll_clear_inode_capas(inode);
+ if (!S_ISDIR(inode->i_mode))
+ LASSERT(list_empty(&lli->lli_agl_list));
+
+ /*
+ * XXX This has to be done before lsm is freed below, because
+ * cl_object still uses inode lsm.
+ */
+ cl_inode_fini(inode);
+ lli->lli_has_smd = false;
+}
+
+static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+ struct md_open_data **mod)
+{
+ struct lustre_md md;
+ struct inode *inode = d_inode(dentry);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *request = NULL;
+ int rc, ia_valid;
+
+ op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
+ &request, mod);
+ if (rc) {
+ ptlrpc_req_finished(request);
+ if (rc == -ENOENT) {
+ clear_nlink(inode);
+ /* Unlinked special device node? Or just a race?
+ * Pretend we did everything. */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode)) {
+ ia_valid = op_data->op_attr.ia_valid;
+ op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
+ rc = simple_setattr(dentry, &op_data->op_attr);
+ op_data->op_attr.ia_valid = ia_valid;
+ }
+ } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
+ CERROR("md_setattr fails: rc = %d\n", rc);
+ }
+ return rc;
+ }
+
+ rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+ sbi->ll_md_exp, &md);
+ if (rc) {
+ ptlrpc_req_finished(request);
+ return rc;
+ }
+
+ ia_valid = op_data->op_attr.ia_valid;
+ /* The inode size will be set in ll_setattr_ost; we can't do it now
+ * since the dirty cache is not cleared yet. */
+ op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+ rc = simple_setattr(dentry, &op_data->op_attr);
+ op_data->op_attr.ia_valid = ia_valid;
+
+ /* Extract epoch data if obtained. */
+ op_data->op_handle = md.body->handle;
+ op_data->op_ioepoch = md.body->ioepoch;
+
+ ll_update_inode(inode, &md);
+ ptlrpc_req_finished(request);
+
+ return rc;
+}
+
+/* Close IO epoch and send Size-on-MDS attribute update. */
+static int ll_setattr_done_writing(struct inode *inode,
+ struct md_op_data *op_data,
+ struct md_open_data *mod)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int rc = 0;
+
+ LASSERT(op_data != NULL);
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
+ CDEBUG(D_INODE, "Epoch %llu closed on "DFID" for truncate\n",
+ op_data->op_ioepoch, PFID(&lli->lli_fid));
+
+ op_data->op_flags = MF_EPOCH_CLOSE;
+ ll_done_writing_attr(inode, op_data);
+ ll_pack_inode2opdata(inode, op_data, NULL);
+
+ rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
+ if (rc == -EAGAIN) {
+ /* MDS has instructed us to obtain Size-on-MDS attribute
+ * from OSTs and send the setattr back to the MDS. */
+ rc = ll_som_update(inode, op_data);
+ } else if (rc) {
+ CERROR("inode %lu mdc truncate failed: rc = %d\n",
+ inode->i_ino, rc);
+ }
+ return rc;
+}
+
+static int ll_setattr_ost(struct inode *inode, struct iattr *attr)
+{
+ struct obd_capa *capa;
+ int rc;
+
+ if (attr->ia_valid & ATTR_SIZE)
+ capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
+ else
+ capa = ll_mdscapa_get(inode);
+
+ rc = cl_setattr_ost(inode, attr, capa);
+
+ if (attr->ia_valid & ATTR_SIZE)
+ ll_truncate_free_capa(capa);
+ else
+ capa_put(capa);
+
+ return rc;
+}
+
+
+/* If this inode has objects allocated to it (lsm != NULL), then the OST
+ * object(s) determine the file size and mtime. Otherwise, the MDS will
+ * keep these values until such a time that objects are allocated for it.
+ * We do the MDS operations first, as the MDS checks permissions for us.
+ * We don't do the MDS RPC if there is nothing that we want to store there,
+ * but there is no harm in updating mtime/atime on the MDS if we are
+ * going to do an RPC anyway.
+ *
+ * If we are doing a truncate, we will send the mtime and ctime updates
+ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
+ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
+ * at the same time.
+ *
+ * In the case of an HSM import, we only set attributes on the MDS.
+ */
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct md_op_data *op_data = NULL;
+ struct md_open_data *mod = NULL;
+ bool file_is_released = false;
+ int rc = 0, rc1 = 0;
+
+ CDEBUG(D_VFSTRACE,
+ "%s: setattr inode %p/fid:"DFID
+ " from %llu to %llu, valid %x, hsm_import %d\n",
+ ll_get_fsname(inode->i_sb, NULL, 0), inode,
+ PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size,
+ attr->ia_valid, hsm_import);
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* Check new size against VFS/VM file size limit and rlimit */
+ rc = inode_newsize_ok(inode, attr->ia_size);
+ if (rc)
+ return rc;
+
+ /* The maximum Lustre file size is variable, based on the
+ * OST maximum object size and number of stripes. This
+ * needs another check in addition to the VFS check above. */
+ if (attr->ia_size > ll_file_maxbytes(inode)) {
+ CDEBUG(D_INODE, "file "DFID" too large %llu > %llu\n",
+ PFID(&lli->lli_fid), attr->ia_size,
+ ll_file_maxbytes(inode));
+ return -EFBIG;
+ }
+
+ attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ }
+
+ /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
+ if (attr->ia_valid & TIMES_SET_FLAGS) {
+ if ((!uid_eq(current_fsuid(), inode->i_uid)) &&
+ !capable(CFS_CAP_FOWNER))
+ return -EPERM;
+ }
+
+ /* We mark all of the fields "set" so MDS/OST does not re-set them */
+ if (attr->ia_valid & ATTR_CTIME) {
+ attr->ia_ctime = CURRENT_TIME;
+ attr->ia_valid |= ATTR_CTIME_SET;
+ }
+ if (!(attr->ia_valid & ATTR_ATIME_SET) &&
+ (attr->ia_valid & ATTR_ATIME)) {
+ attr->ia_atime = CURRENT_TIME;
+ attr->ia_valid |= ATTR_ATIME_SET;
+ }
+ if (!(attr->ia_valid & ATTR_MTIME_SET) &&
+ (attr->ia_valid & ATTR_MTIME)) {
+ attr->ia_mtime = CURRENT_TIME;
+ attr->ia_valid |= ATTR_MTIME_SET;
+ }
+
+ if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+ CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
+ LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
+ get_seconds());
+
+ /* If we are changing file size, file content is modified, flag it. */
+ if (attr->ia_valid & ATTR_SIZE) {
+ attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags |= LLIF_DATA_MODIFIED;
+ spin_unlock(&lli->lli_lock);
+ }
+
+ /* We always do an MDS RPC, even if we're only changing the size;
+ * only the MDS knows whether truncate() should fail with -ETXTBSY */
+
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data)
+ return -ENOMEM;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (attr->ia_valid & ATTR_SIZE)
+ inode_dio_write_done(inode);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ memcpy(&op_data->op_attr, attr, sizeof(*attr));
+
+ /* Open epoch for truncate. */
+ if (exp_connect_som(ll_i2mdexp(inode)) &&
+ (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
+ op_data->op_flags = MF_EPOCH_OPEN;
+
+ /* truncate on a released file must fail with -ENODATA,
+ * so the size must not be set on the MDS for a released file,
+ * but other attributes must be set
+ */
+ if (S_ISREG(inode->i_mode)) {
+ struct lov_stripe_md *lsm;
+ __u32 gen;
+
+ ll_layout_refresh(inode, &gen);
+ lsm = ccc_inode_lsm_get(inode);
+ if (lsm && lsm->lsm_pattern & LOV_PATTERN_F_RELEASED)
+ file_is_released = true;
+ ccc_inode_lsm_put(inode, lsm);
+ }
+
+ /* If not in HSM import mode, clear the size attribute for a released
+ * file. We clear the attribute sent to the MDT in op_data, not the
+ * original received from the caller in attr, which is used later to
+ * decide the return code. */
+ if (file_is_released && (attr->ia_valid & ATTR_SIZE) && !hsm_import)
+ op_data->op_attr.ia_valid &= ~ATTR_SIZE;
+
+ rc = ll_md_setattr(dentry, op_data, &mod);
+ if (rc)
+ goto out;
+
+ /* truncate fails (only when this is not an HSM import); other attributes succeed */
+ if (file_is_released) {
+ if ((attr->ia_valid & ATTR_SIZE) && !hsm_import)
+ rc = -ENODATA;
+ else
+ rc = 0;
+ goto out;
+ }
+
+ /* RPC to MDT is sent, cancel data modification flag */
+ if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+ spin_unlock(&lli->lli_lock);
+ }
+
+ ll_ioepoch_open(lli, op_data->op_ioepoch);
+ if (!S_ISREG(inode->i_mode)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (attr->ia_valid & (ATTR_SIZE |
+ ATTR_ATIME | ATTR_ATIME_SET |
+ ATTR_MTIME | ATTR_MTIME_SET)) {
+ /* For truncate and utimes sending attributes to OSTs, setting
+ * mtime/atime to the past will be performed under PW [0:EOF]
+ * extent lock (new_size:EOF for truncate). It may seem
+ * excessive to send mtime/atime updates to OSTs when not
+ * setting times to the past, but it is necessary due to possible
+ * time de-synchronization between the MDT inode and OST objects */
+ if (attr->ia_valid & ATTR_SIZE)
+ down_write(&lli->lli_trunc_sem);
+ rc = ll_setattr_ost(inode, attr);
+ if (attr->ia_valid & ATTR_SIZE)
+ up_write(&lli->lli_trunc_sem);
+ }
+out:
+ if (op_data) {
+ if (op_data->op_ioepoch) {
+ rc1 = ll_setattr_done_writing(inode, op_data, mod);
+ if (!rc)
+ rc = rc1;
+ }
+ ll_finish_md_op_data(op_data);
+ }
+ if (!S_ISDIR(inode->i_mode)) {
+ mutex_lock(&inode->i_mutex);
+ if ((attr->ia_valid & ATTR_SIZE) && !hsm_import)
+ inode_dio_wait(inode);
+ }
+
+ ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
+ LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
+
+ return rc;
+}
+
+int ll_setattr(struct dentry *de, struct iattr *attr)
+{
+ int mode = d_inode(de)->i_mode;
+
+ if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
+ (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
+ attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+
+ if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
+ (ATTR_SIZE|ATTR_MODE)) &&
+ (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
+ (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+ !(attr->ia_mode & S_ISGID))))
+ attr->ia_valid |= ATTR_FORCE;
+
+ if ((attr->ia_valid & ATTR_MODE) &&
+ (mode & S_ISUID) &&
+ !(attr->ia_mode & S_ISUID) &&
+ !(attr->ia_valid & ATTR_KILL_SUID))
+ attr->ia_valid |= ATTR_KILL_SUID;
+
+ if ((attr->ia_valid & ATTR_MODE) &&
+ ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+ !(attr->ia_mode & S_ISGID) &&
+ !(attr->ia_valid & ATTR_KILL_SGID))
+ attr->ia_valid |= ATTR_KILL_SGID;
+
+ return ll_setattr_raw(de, attr, false);
+}
+
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+ __u64 max_age, __u32 flags)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct obd_statfs obd_osfs;
+ int rc;
+
+ rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
+ if (rc) {
+ CERROR("md_statfs fails: rc = %d\n", rc);
+ return rc;
+ }
+
+ osfs->os_type = sb->s_magic;
+
+ CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n",
+ osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,
+ osfs->os_files);
+
+ if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+ flags |= OBD_STATFS_NODELAY;
+
+ rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags);
+ if (rc) {
+ CERROR("obd_statfs fails: rc = %d\n", rc);
+ return rc;
+ }
+
+ CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n",
+ obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
+ obd_osfs.os_files);
+
+ osfs->os_bsize = obd_osfs.os_bsize;
+ osfs->os_blocks = obd_osfs.os_blocks;
+ osfs->os_bfree = obd_osfs.os_bfree;
+ osfs->os_bavail = obd_osfs.os_bavail;
+
+ /* If we don't have as many objects free on the OST as inodes
+ * on the MDS, we reduce the total number of inodes to
+ * compensate, so that the "inodes in use" number is correct.
+ */
+ if (obd_osfs.os_ffree < osfs->os_ffree) {
+ osfs->os_files = (osfs->os_files - osfs->os_ffree) +
+ obd_osfs.os_ffree;
+ osfs->os_ffree = obd_osfs.os_ffree;
+ }
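+
+ /* A rough illustration of the adjustment above, with made-up numbers:
+ * if the MDS reports os_files = 1000 and os_ffree = 800 (200 inodes
+ * in use) while the OSTs report only obd_osfs.os_ffree = 500 free
+ * objects, the result is os_files = 200 + 500 = 700 and
+ * os_ffree = 500, so "inodes in use" still reads as 200. */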
+
+ return rc;
+}
+int ll_statfs(struct dentry *de, struct kstatfs *sfs)
+{
+ struct super_block *sb = de->d_sb;
+ struct obd_statfs osfs;
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64());
+ ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
+
+ /* Some amount of caching on the client is allowed */
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ 0);
+ if (rc)
+ return rc;
+
+ statfs_unpack(sfs, &osfs);
+
+ /* We need to downshift for all 32-bit kernels, because we can't
+ * tell if the kernel is being called via sys_statfs64() or not.
+ * Stop before overflowing f_bsize - in which case it is better
+ * to just risk EOVERFLOW if caller is using old sys_statfs(). */
+ if (sizeof(long) < 8) {
+ while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
+ sfs->f_bsize <<= 1;
+
+ osfs.os_blocks >>= 1;
+ osfs.os_bfree >>= 1;
+ osfs.os_bavail >>= 1;
+ }
+ }
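+
+ /* A minimal worked example of the downshift above, with hypothetical
+ * values: for osfs.os_blocks = 6ULL << 32 and sfs->f_bsize = 1024 on
+ * a 32-bit kernel, three iterations give f_bsize = 8192 and
+ * os_blocks = 3UL << 30, which fits in an unsigned long while
+ * preserving the byte count: (6ULL << 32) * 1024 == (3ULL << 30) * 8192. */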
+
+ sfs->f_blocks = osfs.os_blocks;
+ sfs->f_bfree = osfs.os_bfree;
+ sfs->f_bavail = osfs.os_bavail;
+ sfs->f_fsid = ll_s2sbi(sb)->ll_fsid;
+ return 0;
+}
+
+void ll_inode_size_lock(struct inode *inode)
+{
+ struct ll_inode_info *lli;
+
+ LASSERT(!S_ISDIR(inode->i_mode));
+
+ lli = ll_i2info(inode);
+ mutex_lock(&lli->lli_size_mutex);
+}
+
+void ll_inode_size_unlock(struct inode *inode)
+{
+ struct ll_inode_info *lli;
+
+ lli = ll_i2info(inode);
+ mutex_unlock(&lli->lli_size_mutex);
+}
+
+void ll_update_inode(struct inode *inode, struct lustre_md *md)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct mdt_body *body = md->body;
+ struct lov_stripe_md *lsm = md->lsm;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+ LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+ if (lsm != NULL) {
+ if (!lli->lli_has_smd &&
+ !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+ cl_file_inode_init(inode, md);
+
+ lli->lli_maxbytes = lsm->lsm_maxbytes;
+ if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
+ lli->lli_maxbytes = MAX_LFS_FILESIZE;
+ }
+
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+ if (body->valid & OBD_MD_FLRMTPERM)
+ ll_update_remote_perm(inode, md->remote_perm);
+ }
+#ifdef CONFIG_FS_POSIX_ACL
+ else if (body->valid & OBD_MD_FLACL) {
+ spin_lock(&lli->lli_lock);
+ if (lli->lli_posix_acl)
+ posix_acl_release(lli->lli_posix_acl);
+ lli->lli_posix_acl = md->posix_acl;
+ spin_unlock(&lli->lli_lock);
+ }
+#endif
+ inode->i_ino = cl_fid_build_ino(&body->fid1,
+ sbi->ll_flags & LL_SBI_32BIT_API);
+ inode->i_generation = cl_fid_build_gen(&body->fid1);
+
+ if (body->valid & OBD_MD_FLATIME) {
+ if (body->atime > LTIME_S(inode->i_atime))
+ LTIME_S(inode->i_atime) = body->atime;
+ lli->lli_lvb.lvb_atime = body->atime;
+ }
+ if (body->valid & OBD_MD_FLMTIME) {
+ if (body->mtime > LTIME_S(inode->i_mtime)) {
+ CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n",
+ inode->i_ino, LTIME_S(inode->i_mtime),
+ body->mtime);
+ LTIME_S(inode->i_mtime) = body->mtime;
+ }
+ lli->lli_lvb.lvb_mtime = body->mtime;
+ }
+ if (body->valid & OBD_MD_FLCTIME) {
+ if (body->ctime > LTIME_S(inode->i_ctime))
+ LTIME_S(inode->i_ctime) = body->ctime;
+ lli->lli_lvb.lvb_ctime = body->ctime;
+ }
+ if (body->valid & OBD_MD_FLMODE)
+ inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
+ if (body->valid & OBD_MD_FLTYPE)
+ inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
+ LASSERT(inode->i_mode != 0);
+ if (S_ISREG(inode->i_mode))
+ inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1,
+ LL_MAX_BLKSIZE_BITS);
+ else
+ inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+ if (body->valid & OBD_MD_FLUID)
+ inode->i_uid = make_kuid(&init_user_ns, body->uid);
+ if (body->valid & OBD_MD_FLGID)
+ inode->i_gid = make_kgid(&init_user_ns, body->gid);
+ if (body->valid & OBD_MD_FLFLAGS)
+ inode->i_flags = ll_ext_to_inode_flags(body->flags);
+ if (body->valid & OBD_MD_FLNLINK)
+ set_nlink(inode, body->nlink);
+ if (body->valid & OBD_MD_FLRDEV)
+ inode->i_rdev = old_decode_dev(body->rdev);
+
+ if (body->valid & OBD_MD_FLID) {
+ /* FID shouldn't be changed! */
+ if (fid_is_sane(&lli->lli_fid)) {
+ LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
+ "Trying to change FID "DFID
+ " to the "DFID", inode %lu/%u(%p)\n",
+ PFID(&lli->lli_fid), PFID(&body->fid1),
+ inode->i_ino, inode->i_generation, inode);
+ } else
+ lli->lli_fid = body->fid1;
+ }
+
+ LASSERT(fid_seq(&lli->lli_fid) != 0);
+
+ if (body->valid & OBD_MD_FLSIZE) {
+ if (exp_connect_som(ll_i2mdexp(inode)) &&
+ S_ISREG(inode->i_mode)) {
+ struct lustre_handle lockh;
+ ldlm_mode_t mode;
+
+ /* As it is possible that a blocking ast has been processed
+ * by this time, we need to check that there is an UPDATE
+ * lock on the client and set LLIF_MDS_SIZE_LOCK while
+ * holding it. */
+ mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
+ &lockh, LDLM_FL_CBPENDING,
+ LCK_CR | LCK_CW |
+ LCK_PR | LCK_PW);
+ if (mode) {
+ if (lli->lli_flags & (LLIF_DONE_WRITING |
+ LLIF_EPOCH_PENDING |
+ LLIF_SOM_DIRTY)) {
+ CERROR("ino %lu flags %u still has size authority! do not trust the size got from MDS\n",
+ inode->i_ino, lli->lli_flags);
+ } else {
+ /* Use old size assignment to avoid
+ * deadlock bz14138 & bz14326 */
+ i_size_write(inode, body->size);
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
+ spin_unlock(&lli->lli_lock);
+ }
+ ldlm_lock_decref(&lockh, mode);
+ }
+ } else {
+ /* Use old size assignment to avoid
+ * deadlock bz14138 & bz14326 */
+ i_size_write(inode, body->size);
+
+ CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
+ inode->i_ino, (unsigned long long)body->size);
+ }
+
+ if (body->valid & OBD_MD_FLBLOCKS)
+ inode->i_blocks = body->blocks;
+ }
+
+ if (body->valid & OBD_MD_FLMDSCAPA) {
+ LASSERT(md->mds_capa);
+ ll_add_capa(inode, md->mds_capa);
+ }
+ if (body->valid & OBD_MD_FLOSSCAPA) {
+ LASSERT(md->oss_capa);
+ ll_add_capa(inode, md->oss_capa);
+ }
+
+ if (body->valid & OBD_MD_TSTATE) {
+ if (body->t_state & MS_RESTORE)
+ lli->lli_flags |= LLIF_FILE_RESTORING;
+ }
+}
+
+void ll_read_inode2(struct inode *inode, void *opaque)
+{
+ struct lustre_md *md = opaque;
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
+ PFID(&lli->lli_fid), inode);
+
+ LASSERT(!lli->lli_has_smd);
+
+ /* Core attributes from the MDS first. This is a new inode, and
+ * the VFS doesn't zero times in the core inode so we have to do
+ * it ourselves. They will be overwritten by either MDS or OST
+ * attributes - we just need to make sure they aren't newer. */
+ LTIME_S(inode->i_mtime) = 0;
+ LTIME_S(inode->i_atime) = 0;
+ LTIME_S(inode->i_ctime) = 0;
+ inode->i_rdev = 0;
+ ll_update_inode(inode, md);
+
+ /* OIDEBUG(inode); */
+
+ if (S_ISREG(inode->i_mode)) {
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+ inode->i_op = &ll_file_inode_operations;
+ inode->i_fop = sbi->ll_fop;
+ inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &ll_dir_inode_operations;
+ inode->i_fop = &ll_dir_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &ll_fast_symlink_inode_operations;
+ } else {
+ inode->i_op = &ll_special_inode_operations;
+
+ init_special_inode(inode, inode->i_mode,
+ inode->i_rdev);
+ }
+}
+
+void ll_delete_inode(struct inode *inode)
+{
+ struct cl_inode_info *lli = cl_i2info(inode);
+
+ if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL)
+ /* discard all dirty pages before truncating them, required by
+ * osc_extent implementation at LU-1030. */
+ cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+ CL_FSYNC_DISCARD, 1);
+
+ truncate_inode_pages_final(&inode->i_data);
+
+ /* Workaround for LU-118 */
+ if (inode->i_data.nrpages) {
+ spin_lock_irq(&inode->i_data.tree_lock);
+ spin_unlock_irq(&inode->i_data.tree_lock);
+ LASSERTF(inode->i_data.nrpages == 0,
+ "inode=%lu/%u(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n",
+ inode->i_ino, inode->i_generation, inode,
+ inode->i_data.nrpages);
+ }
+ /* Workaround end */
+
+ ll_clear_inode(inode);
+ clear_inode(inode);
+}
+
+int ll_iocontrol(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *req = NULL;
+ int rc, flags = 0;
+
+ switch (cmd) {
+ case FSFILT_IOC_GETFLAGS: {
+ struct mdt_body *body;
+ struct md_op_data *op_data;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+ 0, 0, LUSTRE_OPC_ANY,
+ NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ op_data->op_valid = OBD_MD_FLFLAGS;
+ rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+ ll_finish_md_op_data(op_data);
+ if (rc) {
+ CERROR("failure %d inode %lu\n", rc, inode->i_ino);
+ return -abs(rc);
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+ flags = body->flags;
+
+ ptlrpc_req_finished(req);
+
+ return put_user(flags, (int *)arg);
+ }
+ case FSFILT_IOC_SETFLAGS: {
+ struct lov_stripe_md *lsm;
+ struct obd_info oinfo = { { { 0 } } };
+ struct md_op_data *op_data;
+
+ if (get_user(flags, (int *)arg))
+ return -EFAULT;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags;
+ op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+ rc = md_setattr(sbi->ll_md_exp, op_data,
+ NULL, 0, NULL, 0, &req, NULL);
+ ll_finish_md_op_data(op_data);
+ ptlrpc_req_finished(req);
+ if (rc)
+ return rc;
+
+ inode->i_flags = ll_ext_to_inode_flags(flags);
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (!lsm_has_objects(lsm)) {
+ ccc_inode_lsm_put(inode, lsm);
+ return 0;
+ }
+
+ OBDO_ALLOC(oinfo.oi_oa);
+ if (!oinfo.oi_oa) {
+ ccc_inode_lsm_put(inode, lsm);
+ return -ENOMEM;
+ }
+ oinfo.oi_md = lsm;
+ oinfo.oi_oa->o_oi = lsm->lsm_oi;
+ oinfo.oi_oa->o_flags = flags;
+ oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
+ OBD_MD_FLGROUP;
+ oinfo.oi_capa = ll_mdscapa_get(inode);
+ obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
+ rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
+ capa_put(oinfo.oi_capa);
+ OBDO_FREE(oinfo.oi_oa);
+ ccc_inode_lsm_put(inode, lsm);
+
+ if (rc && rc != -EPERM && rc != -EACCES)
+ CERROR("osc_setattr_async fails: rc = %d\n", rc);
+
+ return rc;
+ }
+ default:
+ return -ENOSYS;
+ }
+
+ return 0;
+}
+
+int ll_flush_ctx(struct inode *inode)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+ CDEBUG(D_SEC, "flush context for user %d\n",
+ from_kuid(&init_user_ns, current_uid()));
+
+ obd_set_info_async(NULL, sbi->ll_md_exp,
+ sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+ 0, NULL, NULL);
+ obd_set_info_async(NULL, sbi->ll_dt_exp,
+ sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+ 0, NULL, NULL);
+ return 0;
+}
+
+/* umount -f client means force down, don't save state */
+void ll_umount_begin(struct super_block *sb)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct obd_device *obd;
+ struct obd_ioctl_data *ioc_data;
+
+ CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
+ sb->s_count, atomic_read(&sb->s_active));
+
+ obd = class_exp2obd(sbi->ll_md_exp);
+ if (obd == NULL) {
+ CERROR("Invalid MDC connection handle %#llx\n",
+ sbi->ll_md_exp->exp_handle.h_cookie);
+ return;
+ }
+ obd->obd_force = 1;
+
+ obd = class_exp2obd(sbi->ll_dt_exp);
+ if (obd == NULL) {
+ CERROR("Invalid LOV connection handle %#llx\n",
+ sbi->ll_dt_exp->exp_handle.h_cookie);
+ return;
+ }
+ obd->obd_force = 1;
+
+ ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS);
+ if (ioc_data) {
+ obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
+ sizeof(*ioc_data), ioc_data, NULL);
+
+ obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
+ sizeof(*ioc_data), ioc_data, NULL);
+
+ OBD_FREE_PTR(ioc_data);
+ }
+
+ /* Really, we'd like to wait until there are no requests outstanding,
+ * and then continue. For now, we just invalidate the requests,
+ * schedule() and sleep one second if needed, and hope.
+ */
+ schedule();
+}
+
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ char *profilenm = get_profile_name(sb);
+ int err;
+ __u32 read_only;
+
+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+ read_only = *flags & MS_RDONLY;
+ err = obd_set_info_async(NULL, sbi->ll_md_exp,
+ sizeof(KEY_READ_ONLY),
+ KEY_READ_ONLY, sizeof(read_only),
+ &read_only, NULL);
+ if (err) {
+ LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
+ profilenm, read_only ?
+ "read-only" : "read-write", err);
+ return err;
+ }
+
+ if (read_only)
+ sb->s_flags |= MS_RDONLY;
+ else
+ sb->s_flags &= ~MS_RDONLY;
+
+ if (sbi->ll_flags & LL_SBI_VERBOSE)
+ LCONSOLE_WARN("Remounted %s %s\n", profilenm,
+ read_only ? "read-only" : "read-write");
+ }
+ return 0;
+}
+
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+ struct super_block *sb, struct lookup_intent *it)
+{
+ struct ll_sb_info *sbi = NULL;
+ struct lustre_md md;
+ int rc;
+
+ LASSERT(*inode || sb);
+ sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+ rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
+ sbi->ll_md_exp, &md);
+ if (rc)
+ return rc;
+
+ if (*inode) {
+ ll_update_inode(*inode, &md);
+ } else {
+ LASSERT(sb != NULL);
+
+ /*
+ * At this point the server returns to the client the same FID that
+ * the client generated for the create, so using ->fid1 is okay here.
+ */
+ LASSERT(fid_is_sane(&md.body->fid1));
+
+ *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
+ sbi->ll_flags & LL_SBI_32BIT_API),
+ &md);
+ if (*inode == NULL || IS_ERR(*inode)) {
+#ifdef CONFIG_FS_POSIX_ACL
+ if (md.posix_acl) {
+ posix_acl_release(md.posix_acl);
+ md.posix_acl = NULL;
+ }
+#endif
+ rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM;
+ *inode = NULL;
+ CERROR("new_inode -fatal: rc %d\n", rc);
+ goto out;
+ }
+ }
+
+ /* Handle a piggybacked layout lock.
+ * A layout lock can be piggybacked on getattr and open requests.
+ * The lsm can be applied to the inode only if it comes with a layout
+ * lock, otherwise the correct layout may be overwritten, for example:
+ * 1. proc1: the MDT returns an lsm but does not grant a layout lock
+ * 2. the layout is changed by another client
+ * 3. proc2: refreshes the layout and is granted the layout lock
+ * 4. proc1: would apply a stale layout */
+ if (it != NULL && it->d.lustre.it_lock_mode != 0) {
+ struct lustre_handle lockh;
+ struct ldlm_lock *lock;
+
+ lockh.cookie = it->d.lustre.it_lock_handle;
+ lock = ldlm_handle2lock(&lockh);
+ LASSERT(lock != NULL);
+ if (ldlm_has_layout(lock)) {
+ struct cl_object_conf conf;
+
+ memset(&conf, 0, sizeof(conf));
+ conf.coc_opc = OBJECT_CONF_SET;
+ conf.coc_inode = *inode;
+ conf.coc_lock = lock;
+ conf.u.coc_md = &md;
+ (void)ll_layout_conf(*inode, &conf);
+ }
+ LDLM_LOCK_PUT(lock);
+ }
+
+out:
+ if (md.lsm != NULL)
+ obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+ md_free_lustre_md(sbi->ll_md_exp, &md);
+ return rc;
+}
+
+int ll_obd_statfs(struct inode *inode, void *arg)
+{
+ struct ll_sb_info *sbi = NULL;
+ struct obd_export *exp;
+ char *buf = NULL;
+ struct obd_ioctl_data *data = NULL;
+ __u32 type;
+ __u32 flags;
+ int len = 0, rc;
+
+ if (!inode) {
+ rc = -EINVAL;
+ goto out_statfs;
+ }
+
+ sbi = ll_i2sbi(inode);
+ if (!sbi) {
+ rc = -EINVAL;
+ goto out_statfs;
+ }
+
+ rc = obd_ioctl_getdata(&buf, &len, arg);
+ if (rc)
+ goto out_statfs;
+
+ data = (void *)buf;
+ if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
+ !data->ioc_pbuf1 || !data->ioc_pbuf2) {
+ rc = -EINVAL;
+ goto out_statfs;
+ }
+
+ if (data->ioc_inllen1 != sizeof(__u32) ||
+ data->ioc_inllen2 != sizeof(__u32) ||
+ data->ioc_plen1 != sizeof(struct obd_statfs) ||
+ data->ioc_plen2 != sizeof(struct obd_uuid)) {
+ rc = -EINVAL;
+ goto out_statfs;
+ }
+
+ memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
+ if (type & LL_STATFS_LMV)
+ exp = sbi->ll_md_exp;
+ else if (type & LL_STATFS_LOV)
+ exp = sbi->ll_dt_exp;
+ else {
+ rc = -ENODEV;
+ goto out_statfs;
+ }
+
+ flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0;
+ rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags);
+ if (rc)
+ goto out_statfs;
+out_statfs:
+ if (buf)
+ obd_ioctl_freedata(buf, len);
+ return rc;
+}
+
+int ll_process_config(struct lustre_cfg *lcfg)
+{
+ char *ptr;
+ void *sb;
+ struct lprocfs_static_vars lvars;
+ unsigned long x;
+ int rc = 0;
+
+ lprocfs_llite_init_vars(&lvars);
+
+ /* The instance name contains the sb: lustre-client-aacfe000 */
+ ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
+ if (!ptr || !*(++ptr))
+ return -EINVAL;
+ rc = kstrtoul(ptr, 16, &x);
+ if (rc != 0)
+ return -EINVAL;
+ sb = (void *)x;
+ /* This better be a real Lustre superblock! */
+ LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
+
+ /* Note we have not called client_common_fill_super yet, so
+ proc fns must be able to handle that! */
+ rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
+ lcfg, sb);
+ if (rc > 0)
+ rc = 0;
+ return rc;
+}
+
+/* This function prepares the md_op_data hint for passing it down to the MD stack. */
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+ struct inode *i1, struct inode *i2,
+ const char *name, int namelen,
+ int mode, __u32 opc, void *data)
+{
+ LASSERT(i1 != NULL);
+
+ if (namelen > ll_i2sbi(i1)->ll_namelen)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ if (op_data == NULL)
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+
+ if (op_data == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ll_i2gids(op_data->op_suppgids, i1, i2);
+ op_data->op_fid1 = *ll_inode2fid(i1);
+ op_data->op_capa1 = ll_mdscapa_get(i1);
+
+ if (i2) {
+ op_data->op_fid2 = *ll_inode2fid(i2);
+ op_data->op_capa2 = ll_mdscapa_get(i2);
+ } else {
+ fid_zero(&op_data->op_fid2);
+ op_data->op_capa2 = NULL;
+ }
+
+ op_data->op_name = name;
+ op_data->op_namelen = namelen;
+ op_data->op_mode = mode;
+ op_data->op_mod_time = get_seconds();
+ op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ op_data->op_cap = cfs_curproc_cap_pack();
+ op_data->op_bias = 0;
+ op_data->op_cli_flags = 0;
+ if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) &&
+ filename_is_volatile(name, namelen, NULL))
+ op_data->op_bias |= MDS_CREATE_VOLATILE;
+ op_data->op_opc = opc;
+ op_data->op_mds = 0;
+ op_data->op_data = data;
+
+ /* If the file is being opened after mknod() (normally due to NFS)
+ * try to use the default stripe data from parent directory for
+ * allocating OST objects. Try to pass the parent FID to MDS. */
+ if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
+ !ll_i2info(i2)->lli_has_smd) {
+ struct ll_inode_info *lli = ll_i2info(i2);
+
+ spin_lock(&lli->lli_lock);
+ if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
+ op_data->op_fid1 = lli->lli_pfid;
+ spin_unlock(&lli->lli_lock);
+ /** We ignore the parent's capability temporarily. */
+ }
+
+ /* When called by ll_setattr_raw, file is i1. */
+ if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags)
+ op_data->op_bias |= MDS_DATA_MODIFIED;
+
+ return op_data;
+}
+
+void ll_finish_md_op_data(struct md_op_data *op_data)
+{
+ capa_put(op_data->op_capa1);
+ capa_put(op_data->op_capa2);
+ OBD_FREE_PTR(op_data);
+}
+
+int ll_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+ struct ll_sb_info *sbi;
+
+ LASSERT((seq != NULL) && (dentry != NULL));
+ sbi = ll_s2sbi(dentry->d_sb);
+
+ if (sbi->ll_flags & LL_SBI_NOLCK)
+ seq_puts(seq, ",nolock");
+
+ if (sbi->ll_flags & LL_SBI_FLOCK)
+ seq_puts(seq, ",flock");
+
+ if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+ seq_puts(seq, ",localflock");
+
+ if (sbi->ll_flags & LL_SBI_USER_XATTR)
+ seq_puts(seq, ",user_xattr");
+
+ if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+ seq_puts(seq, ",lazystatfs");
+
+ if (sbi->ll_flags & LL_SBI_USER_FID2PATH)
+ seq_puts(seq, ",user_fid2path");
+
+ return 0;
+}
+
+/**
+ * Get obd name by cmd, and copy out to user space
+ */
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct obd_device *obd;
+
+ if (cmd == OBD_IOC_GETDTNAME)
+ obd = class_exp2obd(sbi->ll_dt_exp);
+ else if (cmd == OBD_IOC_GETMDNAME)
+ obd = class_exp2obd(sbi->ll_md_exp);
+ else
+ return -EINVAL;
+
+ if (!obd)
+ return -ENOENT;
+
+ if (copy_to_user((void *)arg, obd->obd_name,
+ strlen(obd->obd_name) + 1))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * Get the Lustre file system name from \a sb. If \a buf is provided (non-NULL),
+ * the fsname will be returned in this buffer; otherwise, a static buffer will
+ * be used to store the fsname and returned to the caller.
+ */
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
+{
+ static char fsname_static[MTI_NAME_MAXLEN];
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ char *ptr;
+ int len;
+
+ if (buf == NULL) {
+ /* this means the caller wants to use the static buffer
+ * and doesn't care about races. Usually this is in the
+ * error reporting path */
+ buf = fsname_static;
+ buflen = sizeof(fsname_static);
+ }
+
+ len = strlen(lsi->lsi_lmd->lmd_profile);
+ ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+ if (ptr && (strcmp(ptr, "-client") == 0))
+ len -= 7;
+
+ if (unlikely(len >= buflen))
+ len = buflen - 1;
+ strncpy(buf, lsi->lsi_lmd->lmd_profile, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+
+void ll_dirty_page_discard_warn(struct page *page, int ioret)
+{
+ char *buf, *path = NULL;
+ struct dentry *dentry = NULL;
+ struct ccc_object *obj = cl_inode2ccc(page->mapping->host);
+
+ /* this can be called inside spin lock so use GFP_ATOMIC. */
+ buf = (char *)__get_free_page(GFP_ATOMIC);
+ if (buf != NULL) {
+ dentry = d_find_alias(page->mapping->host);
+ if (dentry != NULL)
+ path = dentry_path_raw(dentry, buf, PAGE_SIZE);
+ }
+
+ CDEBUG(D_WARNING,
+ "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n",
+ ll_get_fsname(page->mapping->host->i_sb, NULL, 0),
+ s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev,
+ PFID(&obj->cob_header.coh_lu.loh_fid),
+ (path && !IS_ERR(path)) ? path : "", ioret);
+
+ if (dentry != NULL)
+ dput(dentry);
+
+ if (buf != NULL)
+ free_page((unsigned long)buf);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
new file mode 100644
index 000000000..a90214bb8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -0,0 +1,492 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+#include "../include/linux/lustre_compat25.h"
+
+static const struct vm_operations_struct ll_file_vm_ops;
+
+void policy_from_vma(ldlm_policy_data_t *policy,
+ struct vm_area_struct *vma, unsigned long addr,
+ size_t count)
+{
+ policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
+ (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+ policy->l_extent.end = (policy->l_extent.start + count - 1) |
+ ~CFS_PAGE_MASK;
+}
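+
+/* A small illustration of the extent arithmetic above, assuming 4 KiB
+ * pages (PAGE_CACHE_SHIFT == 12) and made-up values: for
+ * addr - vma->vm_start == 0x2345, vma->vm_pgoff == 0x10 and count == 100,
+ * the start rounds down to 0x2000 + 0x10000 = 0x12000 and the end rounds
+ * up to 0x12fff, so the lock extent always covers whole pages of the
+ * file offset range. */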
+
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+ size_t count)
+{
+ struct vm_area_struct *vma, *ret = NULL;
+
+ /* mmap_sem must have been held by caller. */
+ LASSERT(!down_write_trylock(&mm->mmap_sem));
+
+ for (vma = find_vma(mm, addr);
+ vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+ if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
+ vma->vm_flags & VM_SHARED) {
+ ret = vma;
+ break;
+ }
+ }
+ return ret;
+}
+
+/**
+ * API independent part for page fault initialization.
+ * \param vma - virtual memory area addressed by the page fault
+ * \param env - corresponding lu_env for processing
+ * \param nest - nesting level
+ * \param index - page index corresponding to the fault.
+ * \param ra_flags - vma readahead flags.
+ *
+ * \return allocated and initialized env for the fault operation.
+ * \retval EINVAL if the env can't be allocated
+ * \return other error codes from cl_io_init.
+ */
+static struct cl_io *
+ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret,
+ struct cl_env_nest *nest, pgoff_t index,
+ unsigned long *ra_flags)
+{
+ struct file *file = vma->vm_file;
+ struct inode *inode = file_inode(file);
+ struct cl_io *io;
+ struct cl_fault_io *fio;
+ struct lu_env *env;
+ int rc;
+
+ *env_ret = NULL;
+ if (ll_file_nolock(file))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * A page fault can occur when Lustre I/O is already active for the
+ * current thread, e.g., when doing read/write against a user-level
+ * buffer mapped from a Lustre file. To avoid stomping on the
+ * existing context, optionally force the allocation of a new one.
+ */
+ env = cl_env_nested_get(nest);
+ if (IS_ERR(env))
+ return ERR_PTR(-EINVAL);
+
+ *env_ret = env;
+
+ io = ccc_env_thread_io(env);
+ io->ci_obj = ll_i2info(inode)->lli_clob;
+ LASSERT(io->ci_obj != NULL);
+
+ fio = &io->u.ci_fault;
+ fio->ft_index = index;
+ fio->ft_executable = vma->vm_flags&VM_EXEC;
+
+ /*
+ * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+ * the kernel will not read other pages not covered by ldlm in
+ * filemap_nopage. we do our readahead in ll_readpage.
+ */
+ if (ra_flags != NULL)
+ *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+ vma->vm_flags &= ~VM_SEQ_READ;
+ vma->vm_flags |= VM_RAND_READ;
+
+ CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+ fio->ft_index, fio->ft_executable);
+
+ rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
+ if (rc == 0) {
+ struct ccc_io *cio = ccc_env_io(env);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+ LASSERT(cio->cui_cl.cis_io == io);
+
+ /* The mmap lock must be MANDATORY because it has to cache
+ * pages. */
+ io->ci_lockreq = CILR_MANDATORY;
+ cio->cui_fd = fd;
+ } else {
+ LASSERT(rc < 0);
+ cl_io_fini(env, io);
+ cl_env_nested_put(nest, env);
+ io = ERR_PTR(rc);
+ }
+
+ return io;
+}
+
+/* Sharing code of page_mkwrite method for rhel5 and rhel6 */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+ bool *retry)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ struct vvp_io *vio;
+ struct cl_env_nest nest;
+ int result;
+ sigset_t set;
+ struct inode *inode;
+ struct ll_inode_info *lli;
+
+ LASSERT(vmpage != NULL);
+
+ io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
+ if (IS_ERR(io)) {
+ result = PTR_ERR(io);
+ goto out;
+ }
+
+ result = io->ci_result;
+ if (result < 0)
+ goto out_io;
+
+ io->u.ci_fault.ft_mkwrite = 1;
+ io->u.ci_fault.ft_writable = 1;
+
+ vio = vvp_env_io(env);
+ vio->u.fault.ft_vma = vma;
+ vio->u.fault.ft_vmpage = vmpage;
+
+ set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+ /* we grab lli_trunc_sem to exclude truncate case.
+ * Otherwise, we could add dirty pages into osc cache
+ * while truncate is on-going. */
+ inode = ccc_object_inode(io->ci_obj);
+ lli = ll_i2info(inode);
+ down_read(&lli->lli_trunc_sem);
+
+ result = cl_io_loop(env, io);
+
+ up_read(&lli->lli_trunc_sem);
+
+ cfs_restore_sigs(set);
+
+ if (result == 0) {
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ lock_page(vmpage);
+ if (vmpage->mapping == NULL) {
+ unlock_page(vmpage);
+
+ /* page was truncated and lock was cancelled, return
+ * ENODATA so that VM_FAULT_NOPAGE will be returned
+ * to handle_mm_fault(). */
+ if (result == 0)
+ result = -ENODATA;
+ } else if (!PageDirty(vmpage)) {
+ /* Race: the page has been cleaned by ptlrpcd after it
+ * was unlocked, so it has to be added into the dirty
+ * cache again; otherwise this soon-to-be-dirty page won't
+ * consume any grants, and even worse, if this page is
+ * being transferred it will break the RPC checksum.
+ */
+ unlock_page(vmpage);
+
+ CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
+ vmpage, vmpage->index);
+
+ *retry = true;
+ result = -EAGAIN;
+ }
+
+ if (result == 0) {
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags |= LLIF_DATA_MODIFIED;
+ spin_unlock(&lli->lli_lock);
+ }
+ }
+
+out_io:
+ cl_io_fini(env, io);
+ cl_env_nested_put(&nest, env);
+out:
+ CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
+ LASSERT(ergo(result == 0, PageLocked(vmpage)));
+
+ return result;
+}
+
+
+
+static inline int to_fault_error(int result)
+{
+ switch (result) {
+ case 0:
+ result = VM_FAULT_LOCKED;
+ break;
+ case -EFAULT:
+ result = VM_FAULT_NOPAGE;
+ break;
+ case -ENOMEM:
+ result = VM_FAULT_OOM;
+ break;
+ default:
+ result = VM_FAULT_SIGBUS;
+ break;
+ }
+ return result;
+}
+
+/**
+ * Lustre implementation of a vm_operations_struct::fault() method, called by
+ * the VM to serve a page fault (both in kernel and user space).
+ *
+ * \param vma - virtual memory area struct related to the page fault
+ * \param vmf - structure describing the type and address of the fault
+ *
+ * \return allocated and filled _locked_ page for the address
+ * \retval VM_FAULT_ERROR on general error
+ * \retval NOPAGE_OOM if there is no memory to allocate a new page
+ */
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ struct vvp_io *vio = NULL;
+ struct page *vmpage;
+ unsigned long ra_flags;
+ struct cl_env_nest nest;
+ int result;
+ int fault_ret = 0;
+
+ io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
+ if (IS_ERR(io))
+ return to_fault_error(PTR_ERR(io));
+
+ result = io->ci_result;
+ if (result == 0) {
+ vio = vvp_env_io(env);
+ vio->u.fault.ft_vma = vma;
+ vio->u.fault.ft_vmpage = NULL;
+ vio->u.fault.fault.ft_vmf = vmf;
+ vio->u.fault.fault.ft_flags = 0;
+ vio->u.fault.fault.ft_flags_valid = false;
+
+ result = cl_io_loop(env, io);
+
+ /* ft_flags are only valid if we reached
+ * the call to filemap_fault */
+ if (vio->u.fault.fault.ft_flags_valid)
+ fault_ret = vio->u.fault.fault.ft_flags;
+
+ vmpage = vio->u.fault.ft_vmpage;
+ if (result != 0 && vmpage != NULL) {
+ page_cache_release(vmpage);
+ vmf->page = NULL;
+ }
+ }
+ cl_io_fini(env, io);
+ cl_env_nested_put(&nest, env);
+
+ vma->vm_flags |= ra_flags;
+ if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
+ fault_ret |= to_fault_error(result);
+
+ CDEBUG(D_MMAP, "%s fault %d/%d\n",
+ current->comm, fault_ret, result);
+ return fault_ret;
+}
+
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int count = 0;
+ bool printed = false;
+ int result;
+ sigset_t set;
+
+ /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite,
+ * so that the process can be killed by the admin but other
+ * signals do not cause a segfault. */
+ set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+restart:
+ result = ll_fault0(vma, vmf);
+ LASSERT(!(result & VM_FAULT_LOCKED));
+ if (result == 0) {
+ struct page *vmpage = vmf->page;
+
+ /* check if this page has been truncated */
+ lock_page(vmpage);
+ if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
+ unlock_page(vmpage);
+ page_cache_release(vmpage);
+ vmf->page = NULL;
+
+ if (!printed && ++count > 16) {
+ CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n",
+ current->comm);
+ printed = true;
+ }
+
+ goto restart;
+ }
+
+ result = VM_FAULT_LOCKED;
+ }
+ cfs_restore_sigs(set);
+ return result;
+}
+
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int count = 0;
+ bool printed = false;
+ bool retry;
+ int result;
+
+ do {
+ retry = false;
+ result = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+ if (!printed && ++count > 16) {
+ CWARN("app(%s): the page %lu of file %lu is under heavy contention.\n",
+ current->comm, vmf->pgoff,
+ file_inode(vma->vm_file)->i_ino);
+ printed = true;
+ }
+ } while (retry);
+
+ switch (result) {
+ case 0:
+ LASSERT(PageLocked(vmf->page));
+ result = VM_FAULT_LOCKED;
+ break;
+ case -ENODATA:
+ case -EFAULT:
+ result = VM_FAULT_NOPAGE;
+ break;
+ case -ENOMEM:
+ result = VM_FAULT_OOM;
+ break;
+ case -EAGAIN:
+ result = VM_FAULT_RETRY;
+ break;
+ default:
+ result = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ return result;
+}
+
+/**
+ * To avoid cancelling the locks covering an mmapped region under lock cache pressure,
+ * we track the mapped vma count in ccc_object::cob_mmap_cnt.
+ */
+static void ll_vm_open(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ccc_object *vob = cl_inode2ccc(inode);
+
+ LASSERT(vma->vm_file);
+ LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+ atomic_inc(&vob->cob_mmap_cnt);
+}
+
+/**
+ * Dual to ll_vm_open().
+ */
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ccc_object *vob = cl_inode2ccc(inode);
+
+ LASSERT(vma->vm_file);
+ atomic_dec(&vob->cob_mmap_cnt);
+ LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+}
+
+/* XXX put nice comment here. talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
+{
+ int rc = -ENOENT;
+
+ LASSERTF(last > first, "last %llu first %llu\n", last, first);
+ if (mapping_mapped(mapping)) {
+ rc = 0;
+ unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
+ last - first + 1, 0);
+ }
+
+ return rc;
+}
+
+static const struct vm_operations_struct ll_file_vm_ops = {
+ .fault = ll_fault,
+ .page_mkwrite = ll_page_mkwrite,
+ .open = ll_vm_open,
+ .close = ll_vm_close,
+};
+
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(file);
+ int rc;
+
+ if (ll_file_nolock(file))
+ return -EOPNOTSUPP;
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
+ rc = generic_file_mmap(file, vma);
+ if (rc == 0) {
+ vma->vm_ops = &ll_file_vm_ops;
+ vma->vm_ops->open(vma);
+ /* update the inode's size and mtime */
+ rc = ll_glimpse_size(inode);
+ }
+
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c
new file mode 100644
index 000000000..db43b8138
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c
@@ -0,0 +1,335 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/llite_nfs.c
+ *
+ * NFS export of Lustre Light File System
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Huang Hua <huanghua@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+#include <linux/exportfs.h>
+
+__u32 get_uuid2int(const char *name, int len)
+{
+ __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9;
+ while (len--) {
+ __u32 key = key1 + (key0 ^ (*name++ * 7152373));
+
+ if (key & 0x80000000)
+ key -= 0x7fffffff;
+ key1 = key0;
+ key0 = key;
+ }
+ return (key0 << 1);
+}
+
+void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid)
+{
+ __u64 key = 0, key0 = 0x12a3fe2d, key1 = 0x37abe8f9;
+
+ while (len--) {
+ key = key1 + (key0 ^ (*name++ * 7152373));
+ if (key & 0x8000000000000000ULL)
+ key -= 0x7fffffffffffffffULL;
+ key1 = key0;
+ key0 = key;
+ }
+
+ fsid->val[0] = key;
+ fsid->val[1] = key >> 32;
+}
+
+static int ll_nfs_test_inode(struct inode *inode, void *opaque)
+{
+ return lu_fid_eq(&ll_i2info(inode)->lli_fid,
+ (struct lu_fid *)opaque);
+}
+
+struct inode *search_inode_for_lustre(struct super_block *sb,
+ const struct lu_fid *fid)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct ptlrpc_request *req = NULL;
+ struct inode *inode = NULL;
+ int eadatalen = 0;
+ unsigned long hash = cl_fid_build_ino(fid,
+ ll_need_32bit_api(sbi));
+ struct md_op_data *op_data;
+ int rc;
+
+ CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid));
+
+ inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid);
+ if (inode)
+ return inode;
+
+ rc = ll_get_default_mdsize(sbi, &eadatalen);
+ if (rc)
+ return ERR_PTR(rc);
+
+ /* Because the inode is NULL, ll_prep_md_op_data cannot
+ * be used here, so we allocate op_data ourselves */
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data)
+ return ERR_PTR(-ENOMEM);
+
+ op_data->op_fid1 = *fid;
+ op_data->op_mode = eadatalen;
+ op_data->op_valid = OBD_MD_FLEASIZE;
+
+ /* mds_fid2dentry ignores f_type */
+ rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+ OBD_FREE_PTR(op_data);
+ if (rc) {
+ CERROR("can't get object attrs, fid "DFID", rc %d\n",
+ PFID(fid), rc);
+ return ERR_PTR(rc);
+ }
+ rc = ll_prep_inode(&inode, req, sb, NULL);
+ ptlrpc_req_finished(req);
+ if (rc)
+ return ERR_PTR(rc);
+
+ return inode;
+}
+
+struct lustre_nfs_fid {
+ struct lu_fid lnf_child;
+ struct lu_fid lnf_parent;
+};
+
+static struct dentry *
+ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent)
+{
+ struct inode *inode;
+ struct dentry *result;
+
+ CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid));
+ if (!fid_is_sane(fid))
+ return ERR_PTR(-ESTALE);
+
+ inode = search_inode_for_lustre(sb, fid);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ if (is_bad_inode(inode)) {
+ /* we didn't find the right inode.. */
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+
+ /**
+ * It is an anonymous dentry without OST objects created yet.
+ * We have to find the parent to tell MDS how to init lov objects.
+ */
+ if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd &&
+ parent != NULL) {
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ spin_lock(&lli->lli_lock);
+ lli->lli_pfid = *parent;
+ spin_unlock(&lli->lli_lock);
+ }
+
+ result = d_obtain_alias(inode);
+ if (IS_ERR(result)) {
+ iput(inode);
+ return result;
+ }
+
+ return result;
+}
+
+#define LUSTRE_NFS_FID 0x97
+
+/**
+ * \a connectable - whether nfsd will connect by itself or whether this
+ * should be done by Lustre
+ *
+ * The return value is file handle type:
+ * 1 -- contains child file handle;
+ * 2 -- contains child file handle and parent file handle;
+ * 255 -- error.
+ */
+static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen,
+ struct inode *parent)
+{
+ struct lustre_nfs_fid *nfs_fid = (void *)fh;
+
+ CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n",
+ inode->i_ino, PFID(ll_inode2fid(inode)), *plen,
+ (int)sizeof(struct lustre_nfs_fid));
+
+ if (*plen < sizeof(struct lustre_nfs_fid) / 4)
+ return 255;
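+
+ /* For scale (assuming the usual 16-byte struct lu_fid): the handle
+ * holds two FIDs, so sizeof(struct lustre_nfs_fid) is 32 bytes and
+ * at least 8 32-bit words must be available in *plen. */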
+
+ nfs_fid->lnf_child = *ll_inode2fid(inode);
+ nfs_fid->lnf_parent = *ll_inode2fid(parent);
+ *plen = sizeof(struct lustre_nfs_fid) / 4;
+
+ return LUSTRE_NFS_FID;
+}
+
+static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name,
+ int namelen, loff_t hash, u64 ino,
+ unsigned type)
+{
+ /* It is a hack to access lde_fid for comparison with lgd_fid.
+ * So the input 'name' must be part of the 'lu_dirent'. */
+ struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name);
+ struct ll_getname_data *lgd =
+ container_of(ctx, struct ll_getname_data, ctx);
+ struct lu_fid fid;
+
+ fid_le_to_cpu(&fid, &lde->lde_fid);
+ if (lu_fid_eq(&fid, &lgd->lgd_fid)) {
+ memcpy(lgd->lgd_name, name, namelen);
+ lgd->lgd_name[namelen] = 0;
+ lgd->lgd_found = 1;
+ }
+ return lgd->lgd_found;
+}
+
+static int ll_get_name(struct dentry *dentry, char *name,
+ struct dentry *child)
+{
+ struct inode *dir = d_inode(dentry);
+ int rc;
+ struct ll_getname_data lgd = {
+ .lgd_name = name,
+ .lgd_fid = ll_i2info(d_inode(child))->lli_fid,
+ .ctx.actor = ll_nfs_get_name_filldir,
+ };
+
+ if (!dir || !S_ISDIR(dir->i_mode)) {
+ rc = -ENOTDIR;
+ goto out;
+ }
+
+ if (!dir->i_fop) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&dir->i_mutex);
+ rc = ll_dir_read(dir, &lgd.ctx);
+ mutex_unlock(&dir->i_mutex);
+ if (!rc && !lgd.lgd_found)
+ rc = -ENOENT;
+out:
+ return rc;
+}
+
+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+ if (fh_type != LUSTRE_NFS_FID)
+ return ERR_PTR(-EPROTO);
+
+ return ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent);
+}
+
+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+ if (fh_type != LUSTRE_NFS_FID)
+ return ERR_PTR(-EPROTO);
+
+ return ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL);
+}
+
+static struct dentry *ll_get_parent(struct dentry *dchild)
+{
+ struct ptlrpc_request *req = NULL;
+ struct inode *dir = d_inode(dchild);
+ struct ll_sb_info *sbi;
+ struct dentry *result = NULL;
+ struct mdt_body *body;
+ static char dotdot[] = "..";
+ struct md_op_data *op_data;
+ int rc;
+ int lmmsize;
+
+ LASSERT(dir && S_ISDIR(dir->i_mode));
+
+ sbi = ll_s2sbi(dir->i_sb);
+
+ CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n",
+ dir->i_ino, PFID(ll_inode2fid(dir)));
+
+ rc = ll_get_default_mdsize(sbi, &lmmsize);
+ if (rc != 0)
+ return ERR_PTR(rc);
+
+ op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot,
+ strlen(dotdot), lmmsize,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return (void *)op_data;
+
+ rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+ ll_finish_md_op_data(op_data);
+ if (rc) {
+ CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+ return ERR_PTR(rc);
+ }
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body->valid & OBD_MD_FLID);
+
+ CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n",
+ PFID(ll_inode2fid(dir)), PFID(&body->fid1));
+
+ result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL);
+
+ ptlrpc_req_finished(req);
+ return result;
+}
+
+struct export_operations lustre_export_operations = {
+ .get_parent = ll_get_parent,
+ .encode_fh = ll_encode_fh,
+ .get_name = ll_get_name,
+ .fh_to_dentry = ll_fh_to_dentry,
+ .fh_to_parent = ll_fh_to_parent,
+};
diff --git a/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
new file mode 100644
index 000000000..f4da156f3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
@@ -0,0 +1,300 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_rmtacl.c
+ *
+ * Lustre Remote User Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include "../include/lustre_lite.h"
+#include "../include/lustre_eacl.h"
+#include "llite_internal.h"
+
+static inline __u32 rce_hashfunc(uid_t id)
+{
+ return id & (RCE_HASHES - 1);
+}
+
+static inline __u32 ee_hashfunc(uid_t id)
+{
+ return id & (EE_HASHES - 1);
+}
+
+u64 rce_ops2valid(int ops)
+{
+ switch (ops) {
+ case RMT_LSETFACL:
+ return OBD_MD_FLRMTLSETFACL;
+ case RMT_LGETFACL:
+ return OBD_MD_FLRMTLGETFACL;
+ case RMT_RSETFACL:
+ return OBD_MD_FLRMTRSETFACL;
+ case RMT_RGETFACL:
+ return OBD_MD_FLRMTRGETFACL;
+ default:
+ return 0;
+ }
+}
+
+static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops)
+{
+ struct rmtacl_ctl_entry *rce;
+
+ rce = kzalloc(sizeof(*rce), GFP_NOFS);
+ if (!rce)
+ return NULL;
+
+ INIT_LIST_HEAD(&rce->rce_list);
+ rce->rce_key = key;
+ rce->rce_ops = ops;
+
+ return rce;
+}
+
+static void rce_free(struct rmtacl_ctl_entry *rce)
+{
+ if (!list_empty(&rce->rce_list))
+ list_del(&rce->rce_list);
+
+ OBD_FREE_PTR(rce);
+}
+
+static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct,
+ pid_t key)
+{
+ struct rmtacl_ctl_entry *rce;
+ struct list_head *head = &rct->rct_entries[rce_hashfunc(key)];
+
+ list_for_each_entry(rce, head, rce_list)
+ if (rce->rce_key == key)
+ return rce;
+
+ return NULL;
+}
+
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key)
+{
+ struct rmtacl_ctl_entry *rce;
+
+ spin_lock(&rct->rct_lock);
+ rce = __rct_search(rct, key);
+ spin_unlock(&rct->rct_lock);
+ return rce;
+}
+
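+/*
+ * Insert a remote ACL control entry for @key; any stale entry already
+ * hashed under the same key is warned about and freed first.
+ */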
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops)
+{
+ struct rmtacl_ctl_entry *rce, *e;
+
+ rce = rce_alloc(key, ops);
+ if (rce == NULL)
+ return -ENOMEM;
+
+ spin_lock(&rct->rct_lock);
+ e = __rct_search(rct, key);
+ if (unlikely(e != NULL)) {
+ CWARN("Unexpected stale rmtacl_entry found: [key: %d] [ops: %d]\n",
+ (int)key, ops);
+ rce_free(e);
+ }
+ list_add_tail(&rce->rce_list, &rct->rct_entries[rce_hashfunc(key)]);
+ spin_unlock(&rct->rct_lock);
+
+ return 0;
+}
+
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key)
+{
+ struct rmtacl_ctl_entry *rce;
+
+ spin_lock(&rct->rct_lock);
+ rce = __rct_search(rct, key);
+ if (rce)
+ rce_free(rce);
+ spin_unlock(&rct->rct_lock);
+
+ return rce ? 0 : -ENOENT;
+}
+
+void rct_init(struct rmtacl_ctl_table *rct)
+{
+ int i;
+
+ spin_lock_init(&rct->rct_lock);
+ for (i = 0; i < RCE_HASHES; i++)
+ INIT_LIST_HEAD(&rct->rct_entries[i]);
+}
+
+void rct_fini(struct rmtacl_ctl_table *rct)
+{
+ struct rmtacl_ctl_entry *rce;
+ int i;
+
+ spin_lock(&rct->rct_lock);
+ for (i = 0; i < RCE_HASHES; i++)
+ while (!list_empty(&rct->rct_entries[i])) {
+ rce = list_entry(rct->rct_entries[i].next,
+ struct rmtacl_ctl_entry, rce_list);
+ rce_free(rce);
+ }
+ spin_unlock(&rct->rct_lock);
+}
+
+static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type,
+ ext_acl_xattr_header *header)
+{
+ struct eacl_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_NOFS);
+ if (!ee)
+ return NULL;
+
+ INIT_LIST_HEAD(&ee->ee_list);
+ ee->ee_key = key;
+ ee->ee_fid = *fid;
+ ee->ee_type = type;
+ ee->ee_acl = header;
+
+ return ee;
+}
+
+void ee_free(struct eacl_entry *ee)
+{
+ if (!list_empty(&ee->ee_list))
+ list_del(&ee->ee_list);
+
+ if (ee->ee_acl)
+ lustre_ext_acl_xattr_free(ee->ee_acl);
+
+ OBD_FREE_PTR(ee);
+}
+
+static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key,
+ struct lu_fid *fid, int type)
+{
+ struct eacl_entry *ee;
+ struct list_head *head = &et->et_entries[ee_hashfunc(key)];
+
+ LASSERT(fid != NULL);
+ list_for_each_entry(ee, head, ee_list)
+ if (ee->ee_key == key) {
+ if (lu_fid_eq(&ee->ee_fid, fid) &&
+ ee->ee_type == type) {
+ list_del_init(&ee->ee_list);
+ return ee;
+ }
+ }
+
+ return NULL;
+}
+
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+ struct lu_fid *fid, int type)
+{
+ struct eacl_entry *ee;
+
+ spin_lock(&et->et_lock);
+ ee = __et_search_del(et, key, fid, type);
+ spin_unlock(&et->et_lock);
+ return ee;
+}
+
+void et_search_free(struct eacl_table *et, pid_t key)
+{
+ struct eacl_entry *ee, *next;
+ struct list_head *head = &et->et_entries[ee_hashfunc(key)];
+
+ spin_lock(&et->et_lock);
+ list_for_each_entry_safe(ee, next, head, ee_list)
+ if (ee->ee_key == key)
+ ee_free(ee);
+
+ spin_unlock(&et->et_lock);
+}
+
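+/*
+ * Insert an extended ACL entry for (@key, @fid, @type); a stale entry
+ * matching the same key, fid and type is removed and freed first.
+ */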
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+ ext_acl_xattr_header *header)
+{
+ struct eacl_entry *ee, *e;
+
+ ee = ee_alloc(key, fid, type, header);
+ if (ee == NULL)
+ return -ENOMEM;
+
+ spin_lock(&et->et_lock);
+ e = __et_search_del(et, key, fid, type);
+ if (unlikely(e != NULL)) {
+ CWARN("Unexpected stale eacl_entry found: [key: %d] [fid: " DFID "] [type: %d]\n",
+ (int)key, PFID(fid), type);
+ ee_free(e);
+ }
+ list_add_tail(&ee->ee_list, &et->et_entries[ee_hashfunc(key)]);
+ spin_unlock(&et->et_lock);
+
+ return 0;
+}
+
+void et_init(struct eacl_table *et)
+{
+ int i;
+
+ spin_lock_init(&et->et_lock);
+ for (i = 0; i < EE_HASHES; i++)
+ INIT_LIST_HEAD(&et->et_entries[i]);
+}
+
+void et_fini(struct eacl_table *et)
+{
+ struct eacl_entry *ee;
+ int i;
+
+ spin_lock(&et->et_lock);
+ for (i = 0; i < EE_HASHES; i++)
+ while (!list_empty(&et->et_entries[i])) {
+ ee = list_entry(et->et_entries[i].next,
+ struct eacl_entry, ee_list);
+ ee_free(ee);
+ }
+ spin_unlock(&et->et_lock);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
new file mode 100644
index 000000000..413a8408e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/lloop.c
@@ -0,0 +1,877 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ * linux/drivers/block/loop.c
+ *
+ * Written by Theodore Ts'o, 3/29/93
+ *
+ * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
+ * permitted under the GNU General Public License.
+ *
+ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
+ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
+ *
+ * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
+ *
+ * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
+ *
+ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
+ *
+ * Loadable modules and other fixes by AK, 1998
+ *
+ * Maximum number of loop devices now dynamic via max_loop module parameter.
+ * Russell Kroll <rkroll@exploits.org> 19990701
+ *
+ * Maximum number of loop devices when compiled-in now selectable by passing
+ * max_loop=<1-255> to the kernel on boot.
+ * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
+ *
+ * Completely rewrite request handling to be make_request_fn style and
+ * non blocking, pushing work to a helper thread. Lots of fixes from
+ * Al Viro too.
+ * Jens Axboe <axboe@suse.de>, Nov 2000
+ *
+ * Support up to 256 loop devices
+ * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
+ *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev() */
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/pagevec.h>
+#include <linux/uaccess.h>
+
+#include "../include/lustre_lib.h"
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+
+#define LLOOP_MAX_SEGMENTS LNET_MAX_IOV
+
+/* Possible states of device */
+enum {
+ LLOOP_UNBOUND,
+ LLOOP_BOUND,
+ LLOOP_RUNDOWN,
+};
+
+struct lloop_device {
+ int lo_number;
+ int lo_refcnt;
+ loff_t lo_offset;
+ loff_t lo_sizelimit;
+ int lo_flags;
+ struct file *lo_backing_file;
+ struct block_device *lo_device;
+ unsigned lo_blocksize;
+
+ gfp_t old_gfp_mask;
+
+ spinlock_t lo_lock;
+ struct bio *lo_bio;
+ struct bio *lo_biotail;
+ int lo_state;
+ struct semaphore lo_sem;
+ struct mutex lo_ctl_mutex;
+ atomic_t lo_pending;
+ wait_queue_head_t lo_bh_wait;
+
+ struct request_queue *lo_queue;
+
+ const struct lu_env *lo_env;
+ struct cl_io lo_io;
+ struct ll_dio_pages lo_pvec;
+
+ /* data to handle bio for lustre. */
+ struct lo_request_data {
+ struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
+ loff_t lrd_offsets[LLOOP_MAX_SEGMENTS];
+ } lo_requests[1];
+};
+
+/*
+ * Loop flags
+ */
+enum {
+ LO_FLAGS_READ_ONLY = 1,
+};
+
+static int lloop_major;
+#define MAX_LOOP_DEFAULT 16
+static int max_loop = MAX_LOOP_DEFAULT;
+static struct lloop_device *loop_dev;
+static struct gendisk **disks;
+static struct mutex lloop_mutex;
+static void *ll_iocontrol_magic = NULL;
+
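+/* Size of the loop device in 512-byte sectors, derived from the backing file. */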
+static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
+{
+ loff_t size, offset, loopsize;
+
+ /* Compute loopsize in bytes */
+ size = i_size_read(file->f_mapping->host);
+ offset = lo->lo_offset;
+ loopsize = size - offset;
+ if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
+ loopsize = lo->lo_sizelimit;
+
+ /*
+ * Unfortunately, if we want to do I/O on the device,
+ * the number of 512-byte sectors has to fit into a sector_t.
+ */
+ return loopsize >> 9;
+}
+
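+/*
+ * Transfer all pages covered by the bio list @head to or from the
+ * backing Lustre file through a CIT_MISC cl_io. Returns 0 when the
+ * whole request was transferred, otherwise the short byte count or a
+ * negative error code.
+ */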
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
+{
+ const struct lu_env *env = lo->lo_env;
+ struct cl_io *io = &lo->lo_io;
+ struct inode *inode = file_inode(lo->lo_backing_file);
+ struct cl_object *obj = ll_i2info(inode)->lli_clob;
+ pgoff_t offset;
+ int ret;
+ int rw;
+ u32 page_count = 0;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ struct bio *bio;
+ ssize_t bytes;
+
+ struct ll_dio_pages *pvec = &lo->lo_pvec;
+ struct page **pages = pvec->ldp_pages;
+ loff_t *offsets = pvec->ldp_offsets;
+
+ truncate_inode_pages(inode->i_mapping, 0);
+
+ /* initialize the IO */
+ memset(io, 0, sizeof(*io));
+ io->ci_obj = obj;
+ ret = cl_io_init(env, io, CIT_MISC, obj);
+ if (ret)
+ return io->ci_result;
+ io->ci_lockreq = CILR_NEVER;
+
+ LASSERT(head != NULL);
+ rw = head->bi_rw;
+ for (bio = head; bio != NULL; bio = bio->bi_next) {
+ LASSERT(rw == bio->bi_rw);
+
+ offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_offset != 0);
+ BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE);
+
+ pages[page_count] = bvec.bv_page;
+ offsets[page_count] = offset;
+ page_count++;
+ offset += bvec.bv_len;
+ }
+ LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
+ }
+
+ ll_stats_ops_tally(ll_i2sbi(inode),
+ (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+ page_count);
+
+ pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+ pvec->ldp_nr = page_count;
+
+ /* FIXME: ll_direct_rw_pages() has to allocate many cl_page{}s to
+ * write these pages out to the OST. Worse, even more pages may be
+ * asked to be written out to swap space, and then we end up here
+ * again.
+ * Unfortunately this is not easy to fix.
+ * Thoughts on a solution:
+ * 0. Define a reserved pool for cl_pages, which could be a list of
+ *    pre-allocated cl_pages;
+ * 1. Define a new operation in cl_object_operations{}, say clo_depth,
+ *    which reports how many layers this lustre object has. Generally
+ *    speaking, the depth would be 2: one for llite and one for lovsub.
+ *    However, for SNS there will be more, since additional pages are
+ *    needed to store parity;
+ * 2. Reserve (page_count * depth) cl_pages from the reserved pool.
+ *    Afterwards, clio would allocate pages from the reserved pool,
+ *    which guarantees we need not allocate cl_pages from the generic
+ *    cl_page slab cache.
+ *    Of course, if there are not enough pages in the pool, we might
+ *    be asked to write fewer pages at a time; this purely depends on
+ *    the implementation. In any case, we must be careful to avoid
+ *    deadlock.
+ */
+ mutex_lock(&inode->i_mutex);
+ bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
+ mutex_unlock(&inode->i_mutex);
+ cl_io_fini(env, io);
+ return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
+}
+
+/*
+ * Add bio to back of pending list
+ */
+static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&lo->lo_lock, flags);
+ if (lo->lo_biotail) {
+ lo->lo_biotail->bi_next = bio;
+ lo->lo_biotail = bio;
+ } else
+ lo->lo_bio = lo->lo_biotail = bio;
+ spin_unlock_irqrestore(&lo->lo_lock, flags);
+
+ atomic_inc(&lo->lo_pending);
+ if (waitqueue_active(&lo->lo_bh_wait))
+ wake_up(&lo->lo_bh_wait);
+}
+
+/*
+ * Grab first pending buffer
+ */
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
+{
+ struct bio *first;
+ struct bio **bio;
+ unsigned int count = 0;
+ unsigned int page_count = 0;
+ int rw;
+
+ spin_lock_irq(&lo->lo_lock);
+ first = lo->lo_bio;
+ if (unlikely(first == NULL)) {
+ spin_unlock_irq(&lo->lo_lock);
+ return 0;
+ }
+
+ /* TODO: need to split the bio, too bad. */
+ LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+ rw = first->bi_rw;
+ bio = &lo->lo_bio;
+ while (*bio && (*bio)->bi_rw == rw) {
+ CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
+ (unsigned long long)(*bio)->bi_iter.bi_sector,
+ (*bio)->bi_iter.bi_size,
+ page_count, (*bio)->bi_vcnt);
+ if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+ break;
+
+ page_count += (*bio)->bi_vcnt;
+ count++;
+ bio = &(*bio)->bi_next;
+ }
+ if (*bio) {
+ /* Some of the bios can't be merged. */
+ lo->lo_bio = *bio;
+ *bio = NULL;
+ } else {
+ /* Hit the end of queue */
+ lo->lo_biotail = NULL;
+ lo->lo_bio = NULL;
+ }
+ *req = first;
+ spin_unlock_irq(&lo->lo_lock);
+ return count;
+}
+
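+/*
+ * make_request_fn for the lloop queue: validate the device state and
+ * request type, then queue the bio for the worker thread. Failures
+ * are completed immediately with an I/O error.
+ */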
+static void loop_make_request(struct request_queue *q, struct bio *old_bio)
+{
+ struct lloop_device *lo = q->queuedata;
+ int rw = bio_rw(old_bio);
+ int inactive;
+
+ if (!lo)
+ goto err;
+
+ CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+ (unsigned long long)old_bio->bi_iter.bi_sector,
+ old_bio->bi_iter.bi_size);
+
+ spin_lock_irq(&lo->lo_lock);
+ inactive = lo->lo_state != LLOOP_BOUND;
+ spin_unlock_irq(&lo->lo_lock);
+ if (inactive)
+ goto err;
+
+ if (rw == WRITE) {
+ if (lo->lo_flags & LO_FLAGS_READ_ONLY)
+ goto err;
+ } else if (rw == READA) {
+ rw = READ;
+ } else if (rw != READ) {
+ CERROR("lloop: unknown command (%x)\n", rw);
+ goto err;
+ }
+ loop_add_bio(lo, old_bio);
+ return;
+err:
+ cfs_bio_io_error(old_bio, old_bio->bi_iter.bi_size);
+}
+
+static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
+{
+ int ret;
+ ret = do_bio_lustrebacked(lo, bio);
+ while (bio) {
+ struct bio *tmp = bio->bi_next;
+ bio->bi_next = NULL;
+ cfs_bio_endio(bio, bio->bi_iter.bi_size, ret);
+ bio = tmp;
+ }
+}
+
+static inline int loop_active(struct lloop_device *lo)
+{
+ return atomic_read(&lo->lo_pending) ||
+ (lo->lo_state == LLOOP_RUNDOWN);
+}
+
+/*
+ * worker thread that handles reads/writes to file backed loop devices,
+ * to avoid blocking in our make_request_fn.
+ */
+static int loop_thread(void *data)
+{
+ struct lloop_device *lo = data;
+ struct bio *bio;
+ unsigned int count;
+ unsigned long times = 0;
+ unsigned long total_count = 0;
+
+ struct lu_env *env;
+ int refcheck;
+ int ret = 0;
+
+ set_user_nice(current, MIN_NICE);
+
+ lo->lo_state = LLOOP_BOUND;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env)) {
+ ret = PTR_ERR(env);
+ goto out;
+ }
+
+ lo->lo_env = env;
+ memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
+ lo->lo_pvec.ldp_pages = lo->lo_requests[0].lrd_pages;
+ lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;
+
+ /*
+ * up sem, we are running
+ */
+ up(&lo->lo_sem);
+
+ for (;;) {
+ wait_event(lo->lo_bh_wait, loop_active(lo));
+ if (!atomic_read(&lo->lo_pending)) {
+ int exiting = 0;
+ spin_lock_irq(&lo->lo_lock);
+ exiting = (lo->lo_state == LLOOP_RUNDOWN);
+ spin_unlock_irq(&lo->lo_lock);
+ if (exiting)
+ break;
+ }
+
+ bio = NULL;
+ count = loop_get_bio(lo, &bio);
+ if (!count) {
+ CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
+ continue;
+ }
+
+ total_count += count;
+ if (total_count < count) { /* overflow */
+ total_count = count;
+ times = 1;
+ } else {
+ times++;
+ }
+ if ((times & 127) == 0) {
+ CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+ total_count, times, total_count / times);
+ }
+
+ LASSERT(bio != NULL);
+ LASSERT(count <= atomic_read(&lo->lo_pending));
+ loop_handle_bio(lo, bio);
+ atomic_sub(count, &lo->lo_pending);
+ }
+ cl_env_put(env, &refcheck);
+
+out:
+ up(&lo->lo_sem);
+ return ret;
+}
+
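+/*
+ * Bind a backing Lustre file to the loop device: set up the request
+ * queue limits, capacity and block size, then start the worker thread
+ * and wait until it is running.
+ */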
+static int loop_set_fd(struct lloop_device *lo, struct file *unused,
+ struct block_device *bdev, struct file *file)
+{
+ struct inode *inode;
+ struct address_space *mapping;
+ int lo_flags = 0;
+ int error;
+ loff_t size;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ error = -EBUSY;
+ if (lo->lo_state != LLOOP_UNBOUND)
+ goto out;
+
+ mapping = file->f_mapping;
+ inode = mapping->host;
+
+ error = -EINVAL;
+ if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
+ goto out;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ lo_flags |= LO_FLAGS_READ_ONLY;
+
+ size = get_loop_size(lo, file);
+
+ if ((loff_t)(sector_t)size != size) {
+ error = -EFBIG;
+ goto out;
+ }
+
+ /* remove all pages from the cache so that no dirty pages remain. */
+ truncate_inode_pages(mapping, 0);
+
+ set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+ lo->lo_blocksize = PAGE_CACHE_SIZE;
+ lo->lo_device = bdev;
+ lo->lo_flags = lo_flags;
+ lo->lo_backing_file = file;
+ lo->lo_sizelimit = 0;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+ lo->lo_bio = lo->lo_biotail = NULL;
+
+ /*
+ * set queue make_request_fn, and add limits based on lower level
+ * device
+ */
+ blk_queue_make_request(lo->lo_queue, loop_make_request);
+ lo->lo_queue->queuedata = lo;
+
+ /* queue parameters */
+ CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
+ blk_queue_logical_block_size(lo->lo_queue,
+ (unsigned short)PAGE_CACHE_SIZE);
+ blk_queue_max_hw_sectors(lo->lo_queue,
+ LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
+ blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+
+ set_capacity(disks[lo->lo_number], size);
+ bd_set_size(bdev, size << 9);
+
+ set_blocksize(bdev, lo->lo_blocksize);
+
+ kthread_run(loop_thread, lo, "lloop%d", lo->lo_number);
+ down(&lo->lo_sem);
+ return 0;
+
+out:
+ /* This is safe: open() is still holding a reference. */
+ module_put(THIS_MODULE);
+ return error;
+}
+
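+/*
+ * Unbind the backing file: move the device to LLOOP_RUNDOWN so the
+ * worker thread exits, wait for it, then reset the device state and
+ * drop the file reference.
+ */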
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
+ int count)
+{
+ struct file *filp = lo->lo_backing_file;
+ gfp_t gfp = lo->old_gfp_mask;
+
+ if (lo->lo_state != LLOOP_BOUND)
+ return -ENXIO;
+
+ if (lo->lo_refcnt > count) /* we needed one fd for the ioctl */
+ return -EBUSY;
+
+ if (filp == NULL)
+ return -EINVAL;
+
+ spin_lock_irq(&lo->lo_lock);
+ lo->lo_state = LLOOP_RUNDOWN;
+ spin_unlock_irq(&lo->lo_lock);
+ wake_up(&lo->lo_bh_wait);
+
+ down(&lo->lo_sem);
+ lo->lo_backing_file = NULL;
+ lo->lo_device = NULL;
+ lo->lo_offset = 0;
+ lo->lo_sizelimit = 0;
+ lo->lo_flags = 0;
+ invalidate_bdev(bdev);
+ set_capacity(disks[lo->lo_number], 0);
+ bd_set_size(bdev, 0);
+ mapping_set_gfp_mask(filp->f_mapping, gfp);
+ lo->lo_state = LLOOP_UNBOUND;
+ fput(filp);
+ /* This is safe: open() is still holding a reference. */
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+ struct lloop_device *lo = bdev->bd_disk->private_data;
+
+ mutex_lock(&lo->lo_ctl_mutex);
+ lo->lo_refcnt++;
+ mutex_unlock(&lo->lo_ctl_mutex);
+
+ return 0;
+}
+
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+ struct lloop_device *lo = disk->private_data;
+
+ mutex_lock(&lo->lo_ctl_mutex);
+ --lo->lo_refcnt;
+ mutex_unlock(&lo->lo_ctl_mutex);
+}
+
+/* lloop device node's ioctl function. */
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct lloop_device *lo = bdev->bd_disk->private_data;
+ struct inode *inode = NULL;
+ int err = 0;
+
+ mutex_lock(&lloop_mutex);
+ switch (cmd) {
+ case LL_IOC_LLOOP_DETACH: {
+ err = loop_clr_fd(lo, bdev, 2);
+ if (err == 0)
+ blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+ break;
+ }
+
+ case LL_IOC_LLOOP_INFO: {
+ struct lu_fid fid;
+
+ if (lo->lo_backing_file == NULL) {
+ err = -ENOENT;
+ break;
+ }
+ if (inode == NULL)
+ inode = file_inode(lo->lo_backing_file);
+ if (lo->lo_state == LLOOP_BOUND)
+ fid = ll_i2info(inode)->lli_fid;
+ else
+ fid_zero(&fid);
+
+ if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid)))
+ err = -EFAULT;
+ break;
+ }
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+ mutex_unlock(&lloop_mutex);
+
+ return err;
+}
+
+static struct block_device_operations lo_fops = {
+ .owner = THIS_MODULE,
+ .open = lo_open,
+ .release = lo_release,
+ .ioctl = lo_ioctl,
+};
+
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ *
+ * This is a llite regular file ioctl function. It takes the responsibility
+ * of attaching or detaching a file by a lloop's device number.
+ */
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
+ unsigned int cmd, unsigned long arg,
+ void *magic, int *rcp)
+{
+ struct lloop_device *lo = NULL;
+ struct block_device *bdev = NULL;
+ int err = 0;
+ dev_t dev;
+
+ if (magic != ll_iocontrol_magic)
+ return LLIOC_CONT;
+
+ if (disks == NULL) {
+ err = -ENODEV;
+ goto out1;
+ }
+
+ CWARN("Enter llop_ioctl\n");
+
+ mutex_lock(&lloop_mutex);
+ switch (cmd) {
+ case LL_IOC_LLOOP_ATTACH: {
+ struct lloop_device *lo_free = NULL;
+ int i;
+
+ for (i = 0; i < max_loop; i++, lo = NULL) {
+ lo = &loop_dev[i];
+ if (lo->lo_state == LLOOP_UNBOUND) {
+ if (!lo_free)
+ lo_free = lo;
+ continue;
+ }
+ if (file_inode(lo->lo_backing_file) == file_inode(file))
+ break;
+ }
+ if (lo || !lo_free) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ lo = lo_free;
+ dev = MKDEV(lloop_major, lo->lo_number);
+
+ /* bail out if we cannot write the device number back to user space */
+ if (put_user((long)old_encode_dev(dev), (long *)arg)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ bdev = blkdev_get_by_dev(dev, file->f_mode, NULL);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
+ goto out;
+ }
+
+ get_file(file);
+ err = loop_set_fd(lo, NULL, bdev, file);
+ if (err) {
+ fput(file);
+ blkdev_put(bdev, 0);
+ }
+
+ break;
+ }
+
+ case LL_IOC_LLOOP_DETACH_BYDEV: {
+ int minor;
+
+ dev = old_decode_dev(arg);
+ if (MAJOR(dev) != lloop_major) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ minor = MINOR(dev);
+ if (minor > max_loop - 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ lo = &loop_dev[minor];
+ if (lo->lo_state != LLOOP_BOUND) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ bdev = lo->lo_device;
+ err = loop_clr_fd(lo, bdev, 1);
+ if (err == 0)
+ blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+
+ break;
+ }
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+out:
+ mutex_unlock(&lloop_mutex);
+out1:
+ if (rcp)
+ *rcp = err;
+ return LLIOC_STOP;
+}
+
+static int __init lloop_init(void)
+{
+ int i;
+ unsigned int cmdlist[] = {
+ LL_IOC_LLOOP_ATTACH,
+ LL_IOC_LLOOP_DETACH_BYDEV,
+ };
+
+ if (max_loop < 1 || max_loop > 256) {
+ max_loop = MAX_LOOP_DEFAULT;
+ CWARN("lloop: invalid max_loop (must be between 1 and 256), using default (%u)\n",
+ max_loop);
+ }
+
+ lloop_major = register_blkdev(0, "lloop");
+ if (lloop_major < 0)
+ return -EIO;
+
+ CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+ lloop_major, max_loop);
+
+ ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
+ if (ll_iocontrol_magic == NULL)
+ goto out_mem1;
+
+ loop_dev = kcalloc(max_loop, sizeof(*loop_dev), GFP_KERNEL);
+ if (!loop_dev)
+ goto out_mem1;
+
+ disks = kcalloc(max_loop, sizeof(*disks), GFP_KERNEL);
+ if (!disks)
+ goto out_mem2;
+
+ for (i = 0; i < max_loop; i++) {
+ disks[i] = alloc_disk(1);
+ if (!disks[i])
+ goto out_mem3;
+ }
+
+ mutex_init(&lloop_mutex);
+
+ for (i = 0; i < max_loop; i++) {
+ struct lloop_device *lo = &loop_dev[i];
+ struct gendisk *disk = disks[i];
+
+ lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!lo->lo_queue)
+ goto out_mem4;
+
+ mutex_init(&lo->lo_ctl_mutex);
+ sema_init(&lo->lo_sem, 0);
+ init_waitqueue_head(&lo->lo_bh_wait);
+ lo->lo_number = i;
+ spin_lock_init(&lo->lo_lock);
+ disk->major = lloop_major;
+ disk->first_minor = i;
+ disk->fops = &lo_fops;
+ sprintf(disk->disk_name, "lloop%d", i);
+ disk->private_data = lo;
+ disk->queue = lo->lo_queue;
+ }
+
+ /* We cannot fail after we call this, so another loop! */
+ for (i = 0; i < max_loop; i++)
+ add_disk(disks[i]);
+ return 0;
+
+out_mem4:
+ while (i--)
+ blk_cleanup_queue(loop_dev[i].lo_queue);
+ i = max_loop;
+out_mem3:
+ while (i--)
+ put_disk(disks[i]);
+ OBD_FREE(disks, max_loop * sizeof(*disks));
+out_mem2:
+ OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+out_mem1:
+ unregister_blkdev(lloop_major, "lloop");
+ ll_iocontrol_unregister(ll_iocontrol_magic);
+ CERROR("lloop: ran out of memory\n");
+ return -ENOMEM;
+}
+
+static void lloop_exit(void)
+{
+ int i;
+
+ ll_iocontrol_unregister(ll_iocontrol_magic);
+ for (i = 0; i < max_loop; i++) {
+ del_gendisk(disks[i]);
+ blk_cleanup_queue(loop_dev[i].lo_queue);
+ put_disk(disks[i]);
+ }
+
+ unregister_blkdev(lloop_major, "lloop");
+
+ OBD_FREE(disks, max_loop * sizeof(*disks));
+ OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+}
+
+module_init(lloop_init);
+module_exit(lloop_exit);
+
+module_param(max_loop, int, 0444);
+MODULE_PARM_DESC(max_loop, "maximum number of lloop devices");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre virtual block device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
new file mode 100644
index 000000000..83a9b8547
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c
@@ -0,0 +1,1536 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/lustre_lite.h"
+#include "../include/lprocfs_status.h"
+#include <linux/seq_file.h>
+#include "../include/obd_support.h"
+
+#include "llite_internal.h"
+#include "vvp_internal.h"
+
+/* /proc/lustre/llite mount point registration */
+static struct file_operations ll_rw_extents_stats_fops;
+static struct file_operations ll_rw_extents_stats_pp_fops;
+static struct file_operations ll_rw_offset_stats_fops;
+
+static int ll_blksize_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+ struct obd_statfs osfs;
+ int rc;
+
+ LASSERT(sb != NULL);
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc)
+ seq_printf(m, "%u\n", osfs.os_bsize);
+
+ return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_blksize);
+
+static int ll_kbytestotal_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+ struct obd_statfs osfs;
+ int rc;
+
+ LASSERT(sb != NULL);
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_blocks;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ seq_printf(m, "%llu\n", result);
+ }
+
+ return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_kbytestotal);
+
+static int ll_kbytesfree_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+ struct obd_statfs osfs;
+ int rc;
+
+ LASSERT(sb != NULL);
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_bfree;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ seq_printf(m, "%llu\n", result);
+ }
+
+ return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_kbytesfree);
+
+static int ll_kbytesavail_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+ struct obd_statfs osfs;
+ int rc;
+
+ LASSERT(sb != NULL);
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_bavail;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ seq_printf(m, "%llu\n", result);
+ }
+
+ return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_kbytesavail);
+
+static int ll_filestotal_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+ struct obd_statfs osfs;
+ int rc;
+
+ LASSERT(sb != NULL);
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc)
+ seq_printf(m, "%llu\n", osfs.os_files);
+
+ return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_filestotal);
+
+static int ll_filesfree_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+ struct obd_statfs osfs;
+ int rc;
+
+ LASSERT(sb != NULL);
+ rc = ll_statfs_internal(sb, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc)
+ seq_printf(m, "%llu\n", osfs.os_ffree);
+
+ return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_filesfree);
+
+static int ll_client_type_seq_show(struct seq_file *m, void *v)
+{
+ struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private);
+
+ LASSERT(sbi != NULL);
+
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+ seq_puts(m, "remote client\n");
+ else
+ seq_puts(m, "local client\n");
+
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_client_type);
+
+static int ll_fstype_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+
+ LASSERT(sb != NULL);
+ seq_printf(m, "%s\n", sb->s_type->name);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_fstype);
+
+static int ll_sb_uuid_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = (struct super_block *)m->private;
+
+ LASSERT(sb != NULL);
+ seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_sb_uuid);
+
+static int ll_site_stats_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+
+ /*
+ * See description of statistical counters in struct cl_site, and
+ * struct lu_site.
+ */
+ return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m);
+}
+LPROC_SEQ_FOPS_RO(ll_site_stats);
+
+static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ long pages_number;
+ int mult;
+
+ spin_lock(&sbi->ll_lock);
+ pages_number = sbi->ll_ra_info.ra_max_pages;
+ spin_unlock(&sbi->ll_lock);
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ return lprocfs_seq_read_frac_helper(m, pages_number, mult);
+}
+
+static ssize_t ll_max_readahead_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int mult, rc, pages_number;
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number < 0 || pages_number > totalram_pages / 2) {
+ CERROR("can't set file readahead more than %lu MB\n",
+ totalram_pages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ sbi->ll_ra_info.ra_max_pages = pages_number;
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_max_readahead_mb);
+
+static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ long pages_number;
+ int mult;
+
+ spin_lock(&sbi->ll_lock);
+ pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+ spin_unlock(&sbi->ll_lock);
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ return lprocfs_seq_read_frac_helper(m, pages_number, mult);
+}
+
+static ssize_t ll_max_readahead_per_file_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int mult, rc, pages_number;
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number < 0 ||
+ pages_number > sbi->ll_ra_info.ra_max_pages) {
+ CERROR("can't set file readahead more than max_read_ahead_mb %lu MB\n",
+ sbi->ll_ra_info.ra_max_pages);
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb);
+
+static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *unused)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ long pages_number;
+ int mult;
+
+ spin_lock(&sbi->ll_lock);
+ pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
+ spin_unlock(&sbi->ll_lock);
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ return lprocfs_seq_read_frac_helper(m, pages_number, mult);
+}
+
+static ssize_t ll_max_read_ahead_whole_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int mult, rc, pages_number;
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ /* Cap this at the current max readahead window size, the readahead
+ * algorithm does this anyway so it's pointless to set it larger. */
+ if (pages_number < 0 ||
+ pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
+ CERROR("can't set max_read_ahead_whole_mb more than max_read_ahead_per_file_mb: %lu\n",
+ sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT));
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number;
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb);
+
+static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct cl_client_cache *cache = &sbi->ll_cache;
+ int shift = 20 - PAGE_CACHE_SHIFT;
+ int max_cached_mb;
+ int unused_mb;
+
+ max_cached_mb = cache->ccc_lru_max >> shift;
+ unused_mb = atomic_read(&cache->ccc_lru_left) >> shift;
+ seq_printf(m,
+ "users: %d\n"
+ "max_cached_mb: %d\n"
+ "used_mb: %d\n"
+ "unused_mb: %d\n"
+ "reclaim_count: %u\n",
+ atomic_read(&cache->ccc_users),
+ max_cached_mb,
+ max_cached_mb - unused_mb,
+ unused_mb,
+ cache->ccc_lru_shrinkers);
+ return 0;
+}
+
+static ssize_t ll_max_cached_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct cl_client_cache *cache = &sbi->ll_cache;
+ int mult, rc, pages_number;
+ int diff = 0;
+ int nrpages = 0;
+ char kernbuf[128];
+
+ if (count >= sizeof(kernbuf))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+ kernbuf[count] = 0;
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) -
+ kernbuf;
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number < 0 || pages_number > totalram_pages) {
+ CERROR("%s: can't set max cache more than %lu MB\n",
+ ll_get_fsname(sb, NULL, 0),
+ totalram_pages >> (20 - PAGE_CACHE_SHIFT));
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ diff = pages_number - cache->ccc_lru_max;
+ spin_unlock(&sbi->ll_lock);
+
+ /* easy - add more LRU slots. */
+ if (diff >= 0) {
+ atomic_add(diff, &cache->ccc_lru_left);
+ rc = 0;
+ goto out;
+ }
+
+ diff = -diff;
+ while (diff > 0) {
+ int tmp;
+
+ /* reduce LRU budget from free slots. */
+ do {
+ int ov, nv;
+
+ ov = atomic_read(&cache->ccc_lru_left);
+ if (ov == 0)
+ break;
+
+ nv = ov > diff ? ov - diff : 0;
+ rc = atomic_cmpxchg(&cache->ccc_lru_left, ov, nv);
+ if (likely(ov == rc)) {
+ diff -= ov - nv;
+ nrpages += ov - nv;
+ break;
+ }
+ } while (1);
+
+ if (diff <= 0)
+ break;
+
+ if (sbi->ll_dt_exp == NULL) { /* being initialized */
+ rc = -ENODEV;
+ break;
+ }
+
+ /* difficult - have to ask OSCs to drop LRU slots. */
+ tmp = diff << 1;
+ rc = obd_set_info_async(NULL, sbi->ll_dt_exp,
+ sizeof(KEY_CACHE_LRU_SHRINK),
+ KEY_CACHE_LRU_SHRINK,
+ sizeof(tmp), &tmp, NULL);
+ if (rc < 0)
+ break;
+ }
+
+out:
+ if (rc >= 0) {
+ spin_lock(&sbi->ll_lock);
+ cache->ccc_lru_max = pages_number;
+ spin_unlock(&sbi->ll_lock);
+ rc = count;
+ } else {
+ atomic_add(nrpages, &cache->ccc_lru_left);
+ }
+ return rc;
+}
+LPROC_SEQ_FOPS(ll_max_cached_mb);
+
+static int ll_checksum_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0);
+ return 0;
+}
+
+static ssize_t ll_checksum_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ if (!sbi->ll_dt_exp)
+ /* Not set up yet */
+ return -EAGAIN;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+ if (val)
+ sbi->ll_flags |= LL_SBI_CHECKSUM;
+ else
+ sbi->ll_flags &= ~LL_SBI_CHECKSUM;
+
+ rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+ KEY_CHECKSUM, sizeof(val), &val, NULL);
+ if (rc)
+ CWARN("Failed to set OSC checksum flags: %d\n", rc);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_checksum);
+
+static int ll_max_rw_chunk_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+
+ seq_printf(m, "%lu\n", ll_s2sbi(sb)->ll_max_rw_chunk);
+ return 0;
+}
+
+static ssize_t ll_max_rw_chunk_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ int rc, val;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+ ll_s2sbi(sb)->ll_max_rw_chunk = val;
+ return count;
+}
+LPROC_SEQ_FOPS(ll_max_rw_chunk);
+
+static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type)
+{
+ struct super_block *sb = m->private;
+
+ if (ll_s2sbi(sb)->ll_stats_track_type == type)
+ seq_printf(m, "%d\n", ll_s2sbi(sb)->ll_stats_track_id);
+ else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL)
+ seq_puts(m, "0 (all)\n");
+ else
+ seq_puts(m, "untracked\n");
+
+ return 0;
+}
+
+static int ll_wr_track_id(const char __user *buffer, unsigned long count,
+ void *data, enum stats_track_type type)
+{
+ struct super_block *sb = data;
+ int rc, pid;
+
+ rc = lprocfs_write_helper(buffer, count, &pid);
+ if (rc)
+ return rc;
+ ll_s2sbi(sb)->ll_stats_track_id = pid;
+ if (pid == 0)
+ ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL;
+ else
+ ll_s2sbi(sb)->ll_stats_track_type = type;
+ lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats);
+ return count;
+}
+
+static int ll_track_pid_seq_show(struct seq_file *m, void *v)
+{
+ return ll_rd_track_id(m, STATS_TRACK_PID);
+}
+
+static ssize_t ll_track_pid_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PID);
+}
+LPROC_SEQ_FOPS(ll_track_pid);
+
+static int ll_track_ppid_seq_show(struct seq_file *m, void *v)
+{
+ return ll_rd_track_id(m, STATS_TRACK_PPID);
+}
+
+static ssize_t ll_track_ppid_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PPID);
+}
+LPROC_SEQ_FOPS(ll_track_ppid);
+
+static int ll_track_gid_seq_show(struct seq_file *m, void *v)
+{
+ return ll_rd_track_id(m, STATS_TRACK_GID);
+}
+
+static ssize_t ll_track_gid_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_GID);
+}
+LPROC_SEQ_FOPS(ll_track_gid);
+
+static int ll_statahead_max_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m, "%u\n", sbi->ll_sa_max);
+ return 0;
+}
+
+static ssize_t ll_statahead_max_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val >= 0 && val <= LL_SA_RPC_MAX)
+ sbi->ll_sa_max = val;
+ else
+ CERROR("Bad statahead_max value %d. Valid values are in the range [0, %d]\n",
+ val, LL_SA_RPC_MAX);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_statahead_max);
+
+static int ll_statahead_agl_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0);
+ return 0;
+}
+
+static ssize_t ll_statahead_agl_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val)
+ sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+ else
+ sbi->ll_flags &= ~LL_SBI_AGL_ENABLED;
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_statahead_agl);
+
+static int ll_statahead_stats_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m,
+ "statahead total: %u\n"
+ "statahead wrong: %u\n"
+ "agl total: %u\n",
+ atomic_read(&sbi->ll_sa_total),
+ atomic_read(&sbi->ll_sa_wrong),
+ atomic_read(&sbi->ll_agl_total));
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_statahead_stats);
+
+static int ll_lazystatfs_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m, "%u\n", sbi->ll_flags & LL_SBI_LAZYSTATFS ? 1 : 0);
+ return 0;
+}
+
+static ssize_t ll_lazystatfs_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val)
+ sbi->ll_flags |= LL_SBI_LAZYSTATFS;
+ else
+ sbi->ll_flags &= ~LL_SBI_LAZYSTATFS;
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_lazystatfs);
+
+static int ll_max_easize_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ unsigned int ealen;
+ int rc;
+
+ rc = ll_get_max_mdsize(sbi, &ealen);
+ if (rc)
+ return rc;
+
+ seq_printf(m, "%u\n", ealen);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_max_easize);
+
+static int ll_default_easize_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ unsigned int ealen;
+ int rc;
+
+ rc = ll_get_default_mdsize(sbi, &ealen);
+ if (rc)
+ return rc;
+
+ seq_printf(m, "%u\n", ealen);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_default_easize);
+
+static int ll_max_cookiesize_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ unsigned int cookielen;
+ int rc;
+
+ rc = ll_get_max_cookiesize(sbi, &cookielen);
+ if (rc)
+ return rc;
+
+ seq_printf(m, "%u\n", cookielen);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_max_cookiesize);
+
+static int ll_default_cookiesize_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ unsigned int cookielen;
+ int rc;
+
+ rc = ll_get_default_cookiesize(sbi, &cookielen);
+ if (rc)
+ return rc;
+
+ seq_printf(m, "%u\n", cookielen);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_default_cookiesize);
+
+static int ll_sbi_flags_seq_show(struct seq_file *m, void *v)
+{
+ const char *str[] = LL_SBI_FLAGS;
+ struct super_block *sb = m->private;
+ int flags = ll_s2sbi(sb)->ll_flags;
+ int i = 0;
+
+ while (flags != 0) {
+ if (ARRAY_SIZE(str) <= i) {
+ CERROR("%s: Revise array LL_SBI_FLAGS to match sbi flags please.\n",
+ ll_get_fsname(sb, NULL, 0));
+ return -EINVAL;
+ }
+
+ if (flags & 0x1)
+ seq_printf(m, "%s ", str[i]);
+ flags >>= 1;
+ ++i;
+ }
+ seq_printf(m, "\b\n");
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_sbi_flags);
+
+static int ll_xattr_cache_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ seq_printf(m, "%u\n", sbi->ll_xattr_cache_enabled);
+
+ return 0;
+}
+
+static ssize_t ll_xattr_cache_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct super_block *sb = seq->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val != 0 && val != 1)
+ return -ERANGE;
+
+ if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE))
+ return -ENOTSUPP;
+
+ sbi->ll_xattr_cache_enabled = val;
+
+ return count;
+}
+LPROC_SEQ_FOPS(ll_xattr_cache);
+
+static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
+ { "uuid", &ll_sb_uuid_fops, NULL, 0 },
+ /* { "mntpt_path", ll_rd_path, 0, 0 }, */
+ { "fstype", &ll_fstype_fops, NULL, 0 },
+ { "site", &ll_site_stats_fops, NULL, 0 },
+ { "blocksize", &ll_blksize_fops, NULL, 0 },
+ { "kbytestotal", &ll_kbytestotal_fops, NULL, 0 },
+ { "kbytesfree", &ll_kbytesfree_fops, NULL, 0 },
+ { "kbytesavail", &ll_kbytesavail_fops, NULL, 0 },
+ { "filestotal", &ll_filestotal_fops, NULL, 0 },
+ { "filesfree", &ll_filesfree_fops, NULL, 0 },
+ { "client_type", &ll_client_type_fops, NULL, 0 },
+ /* { "filegroups", lprocfs_rd_filegroups, 0, 0 }, */
+ { "max_read_ahead_mb", &ll_max_readahead_mb_fops, NULL },
+ { "max_read_ahead_per_file_mb", &ll_max_readahead_per_file_mb_fops,
+ NULL },
+ { "max_read_ahead_whole_mb", &ll_max_read_ahead_whole_mb_fops, NULL },
+ { "max_cached_mb", &ll_max_cached_mb_fops, NULL },
+ { "checksum_pages", &ll_checksum_fops, NULL },
+ { "max_rw_chunk", &ll_max_rw_chunk_fops, NULL },
+ { "stats_track_pid", &ll_track_pid_fops, NULL },
+ { "stats_track_ppid", &ll_track_ppid_fops, NULL },
+ { "stats_track_gid", &ll_track_gid_fops, NULL },
+ { "statahead_max", &ll_statahead_max_fops, NULL },
+ { "statahead_agl", &ll_statahead_agl_fops, NULL },
+ { "statahead_stats", &ll_statahead_stats_fops, NULL, 0 },
+ { "lazystatfs", &ll_lazystatfs_fops, NULL },
+ { "max_easize", &ll_max_easize_fops, NULL, 0 },
+ { "default_easize", &ll_default_easize_fops, NULL, 0 },
+ { "max_cookiesize", &ll_max_cookiesize_fops, NULL, 0 },
+ { "default_cookiesize", &ll_default_cookiesize_fops, NULL, 0 },
+ { "sbi_flags", &ll_sbi_flags_fops, NULL, 0 },
+ { "xattr_cache", &ll_xattr_cache_fops, NULL, 0 },
+ { NULL }
+};
+
+#define MAX_STRING_SIZE 128
+
+static const struct llite_file_opcode {
+ __u32 opcode;
+ __u32 type;
+ const char *opname;
+} llite_opcode_table[LPROC_LL_FILE_OPCODES] = {
+ /* file operation */
+ { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" },
+ { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" },
+ { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+ "read_bytes" },
+ { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+ "write_bytes" },
+ { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+ "brw_read" },
+ { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+ "brw_write" },
+ { LPROC_LL_OSC_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+ "osc_read" },
+ { LPROC_LL_OSC_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+ "osc_write" },
+ { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" },
+ { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" },
+ { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" },
+ { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" },
+ { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" },
+ { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" },
+ { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" },
+ /* inode operation */
+ { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" },
+ { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" },
+ { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" },
+ { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" },
+ /* dir inode operation */
+ { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" },
+ { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" },
+ { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" },
+ { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" },
+ { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" },
+ { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" },
+ { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" },
+ { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" },
+ /* special inode operation */
+ { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" },
+ { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" },
+ { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" },
+ { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" },
+ { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" },
+ { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" },
+ { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" },
+ { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" },
+};
+
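+/*
+ * Account @count occurrences of @op in the per-mount stats, honouring
+ * the stats_track_{pid,ppid,gid} filter configured through procfs.
+ */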
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count)
+{
+ if (!sbi->ll_stats)
+ return;
+ if (sbi->ll_stats_track_type == STATS_TRACK_ALL)
+ lprocfs_counter_add(sbi->ll_stats, op, count);
+ else if (sbi->ll_stats_track_type == STATS_TRACK_PID &&
+ sbi->ll_stats_track_id == current->pid)
+ lprocfs_counter_add(sbi->ll_stats, op, count);
+ else if (sbi->ll_stats_track_type == STATS_TRACK_PPID &&
+ sbi->ll_stats_track_id == current->real_parent->pid)
+ lprocfs_counter_add(sbi->ll_stats, op, count);
+ else if (sbi->ll_stats_track_type == STATS_TRACK_GID &&
+ sbi->ll_stats_track_id ==
+ from_kgid(&init_user_ns, current_gid()))
+ lprocfs_counter_add(sbi->ll_stats, op, count);
+}
+EXPORT_SYMBOL(ll_stats_ops_tally);
+
+static const char *ra_stat_string[] = {
+ [RA_STAT_HIT] = "hits",
+ [RA_STAT_MISS] = "misses",
+ [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
+ [RA_STAT_MISS_IN_WINDOW] = "miss inside window",
+ [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
+ [RA_STAT_FAILED_MATCH] = "failed lock match",
+ [RA_STAT_DISCARDED] = "read but discarded",
+ [RA_STAT_ZERO_LEN] = "zero length file",
+ [RA_STAT_ZERO_WINDOW] = "zero size window",
+ [RA_STAT_EOF] = "read-ahead to EOF",
+ [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+ [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(llite, name);
+LPROC_SEQ_FOPS_RO_TYPE(llite, uuid);
+
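+/*
+ * Register the per-mount procfs directory and populate it with the
+ * llite tunables, the I/O and read-ahead statistics files, and the
+ * MDC/OSC common_name and uuid entries.
+ */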
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+ struct super_block *sb, char *osc, char *mdc)
+{
+ struct lprocfs_vars lvars[2];
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct obd_device *obd;
+ struct proc_dir_entry *dir;
+ char name[MAX_STRING_SIZE + 1], *ptr;
+ int err, id, len, rc;
+
+ memset(lvars, 0, sizeof(lvars));
+
+ name[MAX_STRING_SIZE] = '\0';
+ lvars[0].name = name;
+
+ LASSERT(sbi != NULL);
+ LASSERT(mdc != NULL);
+ LASSERT(osc != NULL);
+
+ /* Get fsname */
+ len = strlen(lsi->lsi_lmd->lmd_profile);
+ ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+ if (ptr && (strcmp(ptr, "-client") == 0))
+ len -= 7;
+
+ /* Mount info */
+ snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len,
+ lsi->lsi_lmd->lmd_profile, sb);
+
+ sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
+ if (IS_ERR(sbi->ll_proc_root)) {
+ err = PTR_ERR(sbi->ll_proc_root);
+ sbi->ll_proc_root = NULL;
+ return err;
+ }
+
+ rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444,
+ &vvp_dump_pgcache_file_ops, sbi);
+ if (rc)
+ CWARN("Error adding the dump_page_cache file\n");
+
+ rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644,
+ &ll_rw_extents_stats_fops, sbi);
+ if (rc)
+ CWARN("Error adding the extent_stats file\n");
+
+ rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process",
+ 0644, &ll_rw_extents_stats_pp_fops, sbi);
+ if (rc)
+ CWARN("Error adding the extents_stats_per_process file\n");
+
+ rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644,
+ &ll_rw_offset_stats_fops, sbi);
+ if (rc)
+ CWARN("Error adding the offset_stats file\n");
+
+ /* File operations stats */
+ sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
+ LPROCFS_STATS_FLAG_NONE);
+ if (sbi->ll_stats == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* do counter init */
+ for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) {
+ __u32 type = llite_opcode_table[id].type;
+ void *ptr = NULL;
+ if (type & LPROCFS_TYPE_REGS)
+ ptr = "regs";
+ else if (type & LPROCFS_TYPE_BYTES)
+ ptr = "bytes";
+ else if (type & LPROCFS_TYPE_PAGES)
+ ptr = "pages";
+ lprocfs_counter_init(sbi->ll_stats,
+ llite_opcode_table[id].opcode,
+ (type & LPROCFS_CNTR_AVGMINMAX),
+ llite_opcode_table[id].opname, ptr);
+ }
+ err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats);
+ if (err)
+ goto out;
+
+ sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
+ LPROCFS_STATS_FLAG_NONE);
+ if (sbi->ll_ra_stats == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++)
+ lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
+ ra_stat_string[id], "pages");
+ err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
+ sbi->ll_ra_stats);
+ if (err)
+ goto out;
+
+ err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb);
+ if (err)
+ goto out;
+
+ /* MDC info */
+ obd = class_name2obd(mdc);
+
+ LASSERT(obd != NULL);
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ LASSERT(obd->obd_type->typ_name != NULL);
+
+ dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root);
+ if (dir == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ snprintf(name, MAX_STRING_SIZE, "common_name");
+ lvars[0].fops = &llite_name_fops;
+ err = lprocfs_add_vars(dir, lvars, obd);
+ if (err)
+ goto out;
+
+ snprintf(name, MAX_STRING_SIZE, "uuid");
+ lvars[0].fops = &llite_uuid_fops;
+ err = lprocfs_add_vars(dir, lvars, obd);
+ if (err)
+ goto out;
+
+ /* OSC */
+ obd = class_name2obd(osc);
+
+ LASSERT(obd != NULL);
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ LASSERT(obd->obd_type->typ_name != NULL);
+
+ dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root);
+ if (dir == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ snprintf(name, MAX_STRING_SIZE, "common_name");
+ lvars[0].fops = &llite_name_fops;
+ err = lprocfs_add_vars(dir, lvars, obd);
+ if (err)
+ goto out;
+
+ snprintf(name, MAX_STRING_SIZE, "uuid");
+ lvars[0].fops = &llite_uuid_fops;
+ err = lprocfs_add_vars(dir, lvars, obd);
+out:
+ if (err) {
+ lprocfs_remove(&sbi->ll_proc_root);
+ lprocfs_free_stats(&sbi->ll_ra_stats);
+ lprocfs_free_stats(&sbi->ll_stats);
+ }
+ return err;
+}
+
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
+{
+ if (sbi->ll_proc_root) {
+ lprocfs_remove(&sbi->ll_proc_root);
+ lprocfs_free_stats(&sbi->ll_ra_stats);
+ lprocfs_free_stats(&sbi->ll_stats);
+ }
+}
+#undef MAX_STRING_SIZE
+
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
+ struct seq_file *seq, int which)
+{
+ unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+ unsigned long start, end, r, w;
+ char *unitp = "KMGTPEZY";
+ int i, units = 10;
+ struct per_process_info *pp_info = &io_extents->pp_extents[which];
+
+ read_cum = 0;
+ write_cum = 0;
+ start = 0;
+
+ for (i = 0; i < LL_HIST_MAX; i++) {
+ read_tot += pp_info->pp_r_hist.oh_buckets[i];
+ write_tot += pp_info->pp_w_hist.oh_buckets[i];
+ }
+
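+ /* Bucket i covers extents up to 2^(i + LL_HIST_START) bytes; sizes are
+ * printed in the current unit (*unitp) and roll over to the next unit
+ * letter once the printed boundary reaches 1024. */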
+ for (i = 0; i < LL_HIST_MAX; i++) {
+ r = pp_info->pp_r_hist.oh_buckets[i];
+ w = pp_info->pp_w_hist.oh_buckets[i];
+ read_cum += r;
+ write_cum += w;
+ end = 1 << (i + LL_HIST_START - units);
+ seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | %14lu %4lu %4lu\n",
+ start, *unitp, end, *unitp,
+ (i == LL_HIST_MAX - 1) ? '+' : ' ',
+ r, pct(r, read_tot), pct(read_cum, read_tot),
+ w, pct(w, write_tot), pct(write_cum, write_tot));
+ start = end;
+ if (start == 1 << 10) {
+ start = 1;
+ units += 10;
+ unitp++;
+ }
+ if (read_cum == read_tot && write_cum == write_tot)
+ break;
+ }
+}
+
+static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
+{
+ struct timeval now;
+ struct ll_sb_info *sbi = seq->private;
+ struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+ int k;
+
+ do_gettimeofday(&now);
+
+ if (!sbi->ll_rw_stats_on) {
+ seq_printf(seq, "disabled\n"
+ "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n");
+ return 0;
+ }
+ seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n",
+ now.tv_sec, (unsigned long)now.tv_usec);
+ seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write");
+ seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n",
+ "extents", "calls", "%", "cum%",
+ "calls", "%", "cum%");
+ spin_lock(&sbi->ll_pp_extent_lock);
+ for (k = 0; k < LL_PROCESS_HIST_MAX; k++) {
+ if (io_extents->pp_extents[k].pid != 0) {
+ seq_printf(seq, "\nPID: %d\n",
+ io_extents->pp_extents[k].pid);
+ ll_display_extents_info(io_extents, seq, k);
+ }
+ }
+ spin_unlock(&sbi->ll_pp_extent_lock);
+ return 0;
+}
+
+static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len,
+ loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct ll_sb_info *sbi = seq->private;
+ struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+ int i;
+ int value = 1, rc = 0;
+
+ if (len == 0)
+ return -EINVAL;
+
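+ /* Accept either an integer (0 disables) or the string "disabled"/
+ * "Disabled"; anything else (re)enables the stats. The per-process
+ * histograms are cleared in either case. */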
+ rc = lprocfs_write_helper(buf, len, &value);
+ if (rc < 0 && len < 16) {
+ char kernbuf[16];
+
+ if (copy_from_user(kernbuf, buf, len))
+ return -EFAULT;
+ kernbuf[len] = 0;
+
+ if (kernbuf[len - 1] == '\n')
+ kernbuf[len - 1] = 0;
+
+ if (strcmp(kernbuf, "disabled") == 0 ||
+ strcmp(kernbuf, "Disabled") == 0)
+ value = 0;
+ }
+
+ if (value == 0)
+ sbi->ll_rw_stats_on = 0;
+ else
+ sbi->ll_rw_stats_on = 1;
+
+ spin_lock(&sbi->ll_pp_extent_lock);
+ for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+ io_extents->pp_extents[i].pid = 0;
+ lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+ lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+ }
+ spin_unlock(&sbi->ll_pp_extent_lock);
+ return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats_pp);
+
+static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v)
+{
+ struct timeval now;
+ struct ll_sb_info *sbi = seq->private;
+ struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+ do_gettimeofday(&now);
+
+ if (!sbi->ll_rw_stats_on) {
+ seq_printf(seq, "disabled\n"
+ "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n");
+ return 0;
+ }
+ seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n",
+ now.tv_sec, (unsigned long)now.tv_usec);
+
+ seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write");
+ seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n",
+ "extents", "calls", "%", "cum%",
+ "calls", "%", "cum%");
+ spin_lock(&sbi->ll_lock);
+ ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX);
+ spin_unlock(&sbi->ll_lock);
+
+ return 0;
+}
+
+static ssize_t ll_rw_extents_stats_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct ll_sb_info *sbi = seq->private;
+ struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+ int i;
+ int value = 1, rc = 0;
+
+ if (len == 0)
+ return -EINVAL;
+
+ rc = lprocfs_write_helper(buf, len, &value);
+ if (rc < 0 && len < 16) {
+ char kernbuf[16];
+
+ if (copy_from_user(kernbuf, buf, len))
+ return -EFAULT;
+ kernbuf[len] = 0;
+
+ if (kernbuf[len - 1] == '\n')
+ kernbuf[len - 1] = 0;
+
+ if (strcmp(kernbuf, "disabled") == 0 ||
+ strcmp(kernbuf, "Disabled") == 0)
+ value = 0;
+ }
+
+ if (value == 0)
+ sbi->ll_rw_stats_on = 0;
+ else
+ sbi->ll_rw_stats_on = 1;
+
+ spin_lock(&sbi->ll_pp_extent_lock);
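+ /* Slot LL_PROCESS_HIST_MAX holds the file-system-wide aggregate, so it
+ * is cleared here along with the per-process slots. */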
+ for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+ io_extents->pp_extents[i].pid = 0;
+ lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+ lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+ }
+ spin_unlock(&sbi->ll_pp_extent_lock);
+
+ return len;
+}
+LPROC_SEQ_FOPS(ll_rw_extents_stats);
+
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+ struct ll_file_data *file, loff_t pos,
+ size_t count, int rw)
+{
+ int i, cur = -1;
+ struct ll_rw_process_info *process;
+ struct ll_rw_process_info *offset;
+ int *off_count = &sbi->ll_rw_offset_entry_count;
+ int *process_count = &sbi->ll_offset_process_count;
+ struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+ if (!sbi->ll_rw_stats_on)
+ return;
+ process = sbi->ll_rw_process_info;
+ offset = sbi->ll_rw_offset_info;
+
+ spin_lock(&sbi->ll_pp_extent_lock);
+ /* Extent statistics */
+ for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+ if (io_extents->pp_extents[i].pid == pid) {
+ cur = i;
+ break;
+ }
+ }
+
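+ /* No slot for this PID yet: take the next per-process slot round-robin,
+ * evicting whatever entry was there before. */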
+ if (cur == -1) {
+ /* new process */
+ sbi->ll_extent_process_count =
+ (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX;
+ cur = sbi->ll_extent_process_count;
+ io_extents->pp_extents[cur].pid = pid;
+ lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist);
+ lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
+ }
+
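+ /* Find the smallest histogram bucket that covers this I/O size */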
+ for (i = 0; (count >= (1 << LL_HIST_START << i)) &&
+ (i < (LL_HIST_MAX - 1)); i++)
+ ;
+ if (rw == 0) {
+ io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;
+ io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++;
+ } else {
+ io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++;
+ io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++;
+ }
+ spin_unlock(&sbi->ll_pp_extent_lock);
+
+ spin_lock(&sbi->ll_process_lock);
+ /* Offset statistics */
+ for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+ if (process[i].rw_pid == pid) {
+ if (process[i].rw_last_file != file) {
+ process[i].rw_range_start = pos;
+ process[i].rw_last_file_pos = pos + count;
+ process[i].rw_smallest_extent = count;
+ process[i].rw_largest_extent = count;
+ process[i].rw_offset = 0;
+ process[i].rw_last_file = file;
+ spin_unlock(&sbi->ll_process_lock);
+ return;
+ }
+ if (process[i].rw_last_file_pos != pos) {
+ *off_count =
+ (*off_count + 1) % LL_OFFSET_HIST_MAX;
+ offset[*off_count].rw_op = process[i].rw_op;
+ offset[*off_count].rw_pid = pid;
+ offset[*off_count].rw_range_start =
+ process[i].rw_range_start;
+ offset[*off_count].rw_range_end =
+ process[i].rw_last_file_pos;
+ offset[*off_count].rw_smallest_extent =
+ process[i].rw_smallest_extent;
+ offset[*off_count].rw_largest_extent =
+ process[i].rw_largest_extent;
+ offset[*off_count].rw_offset =
+ process[i].rw_offset;
+ process[i].rw_op = rw;
+ process[i].rw_range_start = pos;
+ process[i].rw_smallest_extent = count;
+ process[i].rw_largest_extent = count;
+ process[i].rw_offset = pos -
+ process[i].rw_last_file_pos;
+ }
+ if (process[i].rw_smallest_extent > count)
+ process[i].rw_smallest_extent = count;
+ if (process[i].rw_largest_extent < count)
+ process[i].rw_largest_extent = count;
+ process[i].rw_last_file_pos = pos + count;
+ spin_unlock(&sbi->ll_process_lock);
+ return;
+ }
+ }
+ *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX;
+ process[*process_count].rw_pid = pid;
+ process[*process_count].rw_op = rw;
+ process[*process_count].rw_range_start = pos;
+ process[*process_count].rw_last_file_pos = pos + count;
+ process[*process_count].rw_smallest_extent = count;
+ process[*process_count].rw_largest_extent = count;
+ process[*process_count].rw_offset = 0;
+ process[*process_count].rw_last_file = file;
+ spin_unlock(&sbi->ll_process_lock);
+}
+
+static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v)
+{
+ struct timeval now;
+ struct ll_sb_info *sbi = seq->private;
+ struct ll_rw_process_info *offset = sbi->ll_rw_offset_info;
+ struct ll_rw_process_info *process = sbi->ll_rw_process_info;
+ int i;
+
+ do_gettimeofday(&now);
+
+ if (!sbi->ll_rw_stats_on) {
+ seq_printf(seq, "disabled\n"
+ "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n");
+ return 0;
+ }
+ spin_lock(&sbi->ll_process_lock);
+
+ seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n",
+ now.tv_sec, (unsigned long)now.tv_usec);
+ seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n",
+ "R/W", "PID", "RANGE START", "RANGE END",
+ "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET");
+ /* We stored the discontiguous offsets here; print them first */
+ for (i = 0; i < LL_OFFSET_HIST_MAX; i++) {
+ if (offset[i].rw_pid != 0)
+ seq_printf(seq,
+ "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu",
+ offset[i].rw_op == READ ? 'R' : 'W',
+ offset[i].rw_pid,
+ offset[i].rw_range_start,
+ offset[i].rw_range_end,
+ (unsigned long)offset[i].rw_smallest_extent,
+ (unsigned long)offset[i].rw_largest_extent,
+ offset[i].rw_offset);
+ }
+ /* Then print the current offsets for each process */
+ for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+ if (process[i].rw_pid != 0)
+ seq_printf(seq,
+ "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu",
+ process[i].rw_op == READ ? 'R' : 'W',
+ process[i].rw_pid,
+ process[i].rw_range_start,
+ process[i].rw_last_file_pos,
+ (unsigned long)process[i].rw_smallest_extent,
+ (unsigned long)process[i].rw_largest_extent,
+ process[i].rw_offset);
+ }
+ spin_unlock(&sbi->ll_process_lock);
+
+ return 0;
+}
+
+static ssize_t ll_rw_offset_stats_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct ll_sb_info *sbi = seq->private;
+ struct ll_rw_process_info *process_info = sbi->ll_rw_process_info;
+ struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info;
+ int value = 1, rc = 0;
+
+ if (len == 0)
+ return -EINVAL;
+
+ rc = lprocfs_write_helper(buf, len, &value);
+
+ if (rc < 0 && len < 16) {
+ char kernbuf[16];
+
+ if (copy_from_user(kernbuf, buf, len))
+ return -EFAULT;
+ kernbuf[len] = 0;
+
+ if (kernbuf[len - 1] == '\n')
+ kernbuf[len - 1] = 0;
+
+ if (strcmp(kernbuf, "disabled") == 0 ||
+ strcmp(kernbuf, "Disabled") == 0)
+ value = 0;
+ }
+
+ if (value == 0)
+ sbi->ll_rw_stats_on = 0;
+ else
+ sbi->ll_rw_stats_on = 1;
+
+ spin_lock(&sbi->ll_process_lock);
+ sbi->ll_offset_process_count = 0;
+ sbi->ll_rw_offset_entry_count = 0;
+ memset(process_info, 0, sizeof(struct ll_rw_process_info) *
+ LL_PROCESS_HIST_MAX);
+ memset(offset_info, 0, sizeof(struct ll_rw_process_info) *
+ LL_OFFSET_HIST_MAX);
+ spin_unlock(&sbi->ll_process_lock);
+
+ return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_offset_stats);
+
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = NULL;
+ lvars->obd_vars = lprocfs_llite_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
new file mode 100644
index 000000000..5a25dcd10
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/namei.c
@@ -0,0 +1,1178 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_ver.h"
+#include "llite_internal.h"
+
+static int ll_create_it(struct inode *, struct dentry *,
+ int, struct lookup_intent *);
+
+/* called from iget5_locked->find_inode() under inode_hash_lock spinlock */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lustre_md *md = opaque;
+
+ if (unlikely(!(md->body->valid & OBD_MD_FLID))) {
+ CERROR("MDS body missing FID\n");
+ return 0;
+ }
+
+ if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1))
+ return 0;
+
+ return 1;
+}
+
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct mdt_body *body = ((struct lustre_md *)opaque)->body;
+
+ if (unlikely(!(body->valid & OBD_MD_FLID))) {
+ CERROR("MDS body missing FID\n");
+ return -EINVAL;
+ }
+
+ lli->lli_fid = body->fid1;
+ if (unlikely(!(body->valid & OBD_MD_FLTYPE))) {
+ CERROR("Can not initialize inode " DFID
+ " without object type: valid = %#llx\n",
+ PFID(&lli->lli_fid), body->valid);
+ return -EINVAL;
+ }
+
+ inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT);
+ if (unlikely(inode->i_mode == 0)) {
+ CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
+ return -EINVAL;
+ }
+
+ ll_lli_init(lli);
+
+ return 0;
+}
+
+/*
+ * Get an inode by inode number (already instantiated by the intent lookup).
+ * Returns inode or NULL
+ */
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+ struct lustre_md *md)
+{
+ struct inode *inode;
+
+ LASSERT(hash != 0);
+ inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
+
+ if (inode) {
+ if (inode->i_state & I_NEW) {
+ int rc = 0;
+
+ ll_read_inode2(inode, md);
+ if (S_ISREG(inode->i_mode) &&
+ ll_i2info(inode)->lli_clob == NULL) {
+ CDEBUG(D_INODE,
+ "%s: apply lsm %p to inode "DFID".\n",
+ ll_get_fsname(sb, NULL, 0), md->lsm,
+ PFID(ll_inode2fid(inode)));
+ rc = cl_file_inode_init(inode, md);
+ }
+ if (rc != 0) {
+ make_bad_inode(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ inode = ERR_PTR(rc);
+ } else
+ unlock_new_inode(inode);
+ } else if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+ ll_update_inode(inode, md);
+ CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n",
+ inode, PFID(&md->body->fid1));
+ }
+ return inode;
+}
+
+static void ll_invalidate_negative_children(struct inode *dir)
+{
+ struct dentry *dentry, *tmp_subdir;
+ struct ll_d_hlist_node *p;
+
+ ll_lock_dcache(dir);
+ ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_u.d_alias) {
+ spin_lock(&dentry->d_lock);
+ if (!list_empty(&dentry->d_subdirs)) {
+ struct dentry *child;
+
+ list_for_each_entry_safe(child, tmp_subdir,
+ &dentry->d_subdirs,
+ d_child) {
+ if (d_really_is_negative(child))
+ d_lustre_invalidate(child, 1);
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+ }
+ ll_unlock_dcache(dir);
+}
+
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag)
+{
+ struct lustre_handle lockh;
+ int rc;
+
+ switch (flag) {
+ case LDLM_CB_BLOCKING:
+ ldlm_lock2handle(lock, &lockh);
+ rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+ if (rc < 0) {
+ CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc);
+ return rc;
+ }
+ break;
+ case LDLM_CB_CANCELING: {
+ struct inode *inode = ll_inode_from_resource_lock(lock);
+ __u64 bits = lock->l_policy_data.l_inodebits.bits;
+
+ /* Inode is set to lock->l_resource->lr_lvb_inode
+ * for mdc - bug 24555 */
+ LASSERT(lock->l_ast_data == NULL);
+
+ if (inode == NULL)
+ break;
+
+ /* Invalidate all dentries associated with this inode */
+ LASSERT(lock->l_flags & LDLM_FL_CANCELING);
+
+ if (!fid_res_name_eq(ll_inode2fid(inode),
+ &lock->l_resource->lr_name)) {
+ LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)",
+ PFID(ll_inode2fid(inode)), inode);
+ LBUG();
+ }
+
+ if (bits & MDS_INODELOCK_XATTR) {
+ ll_xattr_cache_destroy(inode);
+ bits &= ~MDS_INODELOCK_XATTR;
+ }
+
+ /* For OPEN locks we differentiate between lock modes
+ * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
+ if (bits & MDS_INODELOCK_OPEN)
+ ll_have_md_lock(inode, &bits, lock->l_req_mode);
+
+ if (bits & MDS_INODELOCK_OPEN) {
+ fmode_t fmode;
+
+ switch (lock->l_req_mode) {
+ case LCK_CW:
+ fmode = FMODE_WRITE;
+ break;
+ case LCK_PR:
+ fmode = FMODE_EXEC;
+ break;
+ case LCK_CR:
+ fmode = FMODE_READ;
+ break;
+ default:
+ LDLM_ERROR(lock, "bad lock mode for OPEN lock");
+ LBUG();
+ }
+
+ ll_md_real_close(inode, fmode);
+ }
+
+ if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+ MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+ ll_have_md_lock(inode, &bits, LCK_MINMODE);
+
+ if (bits & MDS_INODELOCK_LAYOUT) {
+ struct cl_object_conf conf = {
+ .coc_opc = OBJECT_CONF_INVALIDATE,
+ .coc_inode = inode,
+ };
+
+ rc = ll_layout_conf(inode, &conf);
+ if (rc < 0)
+ CDEBUG(D_INODE, "cannot invalidate layout of "
+ DFID": rc = %d\n",
+ PFID(ll_inode2fid(inode)), rc);
+ }
+
+ if (bits & MDS_INODELOCK_UPDATE) {
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+ spin_unlock(&lli->lli_lock);
+ }
+
+ if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) {
+ CDEBUG(D_INODE, "invalidating inode %lu\n",
+ inode->i_ino);
+ truncate_inode_pages(inode->i_mapping, 0);
+ ll_invalidate_negative_children(inode);
+ }
+
+ if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) &&
+ inode->i_sb->s_root != NULL &&
+ !is_root_inode(inode))
+ ll_invalidate_aliases(inode);
+
+ iput(inode);
+ break;
+ }
+ default:
+ LBUG();
+ }
+
+ return 0;
+}
+
+__u32 ll_i2suppgid(struct inode *i)
+{
+ if (in_group_p(i->i_gid))
+ return (__u32)from_kgid(&init_user_ns, i->i_gid);
+ else
+ return (__u32)(-1);
+}
+
+/* Pack the required supplementary groups into the supplied groups array.
+ * If we don't need to use the groups from the target inode(s) then we
+ * instead pack one or more groups from the user's supplementary group
+ * array in case it might be useful. Not needed if doing an MDS-side upcall. */
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
+{
+#if 0
+ int i;
+#endif
+
+ LASSERT(i1 != NULL);
+ LASSERT(suppgids != NULL);
+
+ suppgids[0] = ll_i2suppgid(i1);
+
+ if (i2)
+ suppgids[1] = ll_i2suppgid(i2);
+ else
+ suppgids[1] = -1;
+
+#if 0
+ for (i = 0; i < current_ngroups; i++) {
+ if (suppgids[0] == -1) {
+ if (current_groups[i] != suppgids[1])
+ suppgids[0] = current_groups[i];
+ continue;
+ }
+ if (suppgids[1] == -1) {
+ if (current_groups[i] != suppgids[0])
+ suppgids[1] = current_groups[i];
+ continue;
+ }
+ break;
+ }
+#endif
+}
+
+/*
+ * Try to reuse three types of dentry:
+ * 1. unhashed alias: unhashed by d_invalidate (but it may be made valid again
+ * by a concurrent .revalidate).
+ * 2. INVALID alias: the common case when no valid ldlm lock is held, but the
+ * flag may be cleared by others calling d_lustre_revalidate.
+ * 3. DISCONNECTED alias.
+ */
+static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
+{
+ struct dentry *alias, *discon_alias, *invalid_alias;
+ struct ll_d_hlist_node *p;
+
+ if (ll_d_hlist_empty(&inode->i_dentry))
+ return NULL;
+
+ discon_alias = invalid_alias = NULL;
+
+ ll_lock_dcache(inode);
+ ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_u.d_alias) {
+ LASSERT(alias != dentry);
+
+ spin_lock(&alias->d_lock);
+ if (alias->d_flags & DCACHE_DISCONNECTED)
+ /* LASSERT(last_discon == NULL); LU-405, bz 20055 */
+ discon_alias = alias;
+ else if (alias->d_parent == dentry->d_parent &&
+ alias->d_name.hash == dentry->d_name.hash &&
+ alias->d_name.len == dentry->d_name.len &&
+ memcmp(alias->d_name.name, dentry->d_name.name,
+ dentry->d_name.len) == 0)
+ invalid_alias = alias;
+ spin_unlock(&alias->d_lock);
+
+ if (invalid_alias)
+ break;
+ }
+ alias = invalid_alias ?: discon_alias ?: NULL;
+ if (alias) {
+ spin_lock(&alias->d_lock);
+ dget_dlock(alias);
+ spin_unlock(&alias->d_lock);
+ }
+ ll_unlock_dcache(inode);
+
+ return alias;
+}
+
+/*
+ * Similar to d_splice_alias(), but lustre treats invalid alias
+ * similar to DCACHE_DISCONNECTED, and tries to use it anyway.
+ */
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
+{
+ struct dentry *new;
+ int rc;
+
+ if (inode) {
+ new = ll_find_alias(inode, de);
+ if (new) {
+ rc = ll_d_init(new);
+ if (rc < 0) {
+ dput(new);
+ return ERR_PTR(rc);
+ }
+ d_move(new, de);
+ iput(inode);
+ CDEBUG(D_DENTRY,
+ "Reuse dentry %p inode %p refc %d flags %#x\n",
+ new, d_inode(new), d_count(new), new->d_flags);
+ return new;
+ }
+ }
+ rc = ll_d_init(de);
+ if (rc < 0)
+ return ERR_PTR(rc);
+ d_add(de, inode);
+ CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
+ de, d_inode(de), d_count(de), de->d_flags);
+ return de;
+}
+
+static int ll_lookup_it_finish(struct ptlrpc_request *request,
+ struct lookup_intent *it,
+ struct inode *parent, struct dentry **de)
+{
+ struct inode *inode = NULL;
+ __u64 bits = 0;
+ int rc;
+
+ /* NB 1 request reference will be taken away by ll_intent_lock()
+ * when I return */
+ CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
+ it->d.lustre.it_disposition);
+ if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+ rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
+ if (rc)
+ return rc;
+
+ ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
+
+ /* We used to query the real size from the OSTs here, but this
+ * is not needed. For stat() calls the size is updated from the
+ * subsequent do_revalidate()->ll_inode_revalidate_it() in 2.4
+ * and vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it()
+ * in 2.6. Everybody else who needs a correct file size calls
+ * ll_glimpse_size() or an equivalent themselves anyway.
+ * Also see bug 7198. */
+ }
+
+ /* Only hash *de if it is unhashed (new dentry).
+ * Atomic_open may pass hashed dentries for open.
+ */
+ if (d_unhashed(*de)) {
+ struct dentry *alias;
+
+ alias = ll_splice_alias(inode, *de);
+ if (IS_ERR(alias))
+ return PTR_ERR(alias);
+ *de = alias;
+ } else if (!it_disposition(it, DISP_LOOKUP_NEG) &&
+ !it_disposition(it, DISP_OPEN_CREATE)) {
+ /* With DISP_OPEN_CREATE the dentry will be
+ * instantiated in ll_create_it. */
+ LASSERT(d_inode(*de) == NULL);
+ d_instantiate(*de, inode);
+ }
+
+ if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+ /* we have a lookup lock - unhide the dentry */
+ if (bits & MDS_INODELOCK_LOOKUP)
+ d_lustre_revalidate(*de);
+ } else if (!it_disposition(it, DISP_OPEN_CREATE)) {
+ /* If file created on server, don't depend on parent UPDATE
+ * lock to unhide it. It is left hidden and next lookup can
+ * find it in ll_splice_alias.
+ */
+ /* Check that parent has UPDATE lock. */
+ struct lookup_intent parent_it = {
+ .it_op = IT_GETATTR,
+ .d.lustre.it_lock_handle = 0 };
+
+ if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it,
+ &ll_i2info(parent)->lli_fid, NULL)) {
+ d_lustre_revalidate(*de);
+ ll_intent_release(&parent_it);
+ }
+ }
+
+ return 0;
+}
+
+static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
+ struct lookup_intent *it, int lookup_flags)
+{
+ struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+ struct dentry *save = dentry, *retval;
+ struct ptlrpc_request *req = NULL;
+ struct inode *inode;
+ struct md_op_data *op_data;
+ __u32 opc;
+ int rc;
+
+ if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),intent=%s\n",
+ dentry, parent->i_ino,
+ parent->i_generation, parent, LL_IT2STR(it));
+
+ if (d_mountpoint(dentry))
+ CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
+
+ if (it == NULL || it->it_op == IT_GETXATTR)
+ it = &lookup_it;
+
+ if (it->it_op == IT_GETATTR) {
+ rc = ll_statahead_enter(parent, &dentry, 0);
+ if (rc == 1) {
+ if (dentry == save)
+ retval = NULL;
+ else
+ retval = dentry;
+ goto out;
+ }
+ }
+
+ if (it->it_op & IT_CREAT)
+ opc = LUSTRE_OPC_CREATE;
+ else
+ opc = LUSTRE_OPC_ANY;
+
+ op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name,
+ dentry->d_name.len, lookup_flags, opc,
+ NULL);
+ if (IS_ERR(op_data))
+ return (void *)op_data;
+
+ /* enforce umask if acl disabled or MDS doesn't support umask */
+ if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
+ it->it_create_mode &= ~current_umask();
+
+ rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it,
+ lookup_flags, &req, ll_md_blocking_ast, 0);
+ ll_finish_md_op_data(op_data);
+ if (rc < 0) {
+ retval = ERR_PTR(rc);
+ goto out;
+ }
+
+ rc = ll_lookup_it_finish(req, it, parent, &dentry);
+ if (rc != 0) {
+ ll_intent_release(it);
+ retval = ERR_PTR(rc);
+ goto out;
+ }
+
+ inode = d_inode(dentry);
+ if ((it->it_op & IT_OPEN) && inode &&
+ !S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode)) {
+ ll_release_openhandle(inode, it);
+ }
+ ll_lookup_finish_locks(it, inode);
+
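+ /* Returning NULL tells the VFS to keep using the dentry that was
+ * passed in; otherwise hand back the alias that replaced it. */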
+ if (dentry == save)
+ retval = NULL;
+ else
+ retval = dentry;
+ goto out;
+ out:
+ if (req)
+ ptlrpc_req_finished(req);
+ if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry))
+ ll_statahead_mark(parent, dentry);
+ return retval;
+}
+
+static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct lookup_intent *itp, it = { .it_op = IT_GETATTR };
+ struct dentry *de;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),flags=%u\n",
+ dentry, parent->i_ino,
+ parent->i_generation, parent, flags);
+
+ /* Optimize away (CREATE && !OPEN). Let .create handle the race. */
+ if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN))
+ return NULL;
+
+ if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE))
+ itp = NULL;
+ else
+ itp = &it;
+ de = ll_lookup_it(parent, dentry, itp, 0);
+
+ if (itp != NULL)
+ ll_intent_release(itp);
+
+ return de;
+}
+
+/*
+ * For cached negative dentry and new dentry, handle lookup/create/open
+ * together.
+ */
+static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, unsigned open_flags,
+ umode_t mode, int *opened)
+{
+ struct lookup_intent *it;
+ struct dentry *de;
+ long long lookup_flags = LOOKUP_OPEN;
+ int rc = 0;
+
+ CDEBUG(D_VFSTRACE,
+ "VFS Op:name=%pd,dir=%lu/%u(%p),file %p,open_flags %x,mode %x opened %d\n",
+ dentry, dir->i_ino,
+ dir->i_generation, dir, file, open_flags, mode, *opened);
+
+ it = kzalloc(sizeof(*it), GFP_NOFS);
+ if (!it)
+ return -ENOMEM;
+
+ it->it_op = IT_OPEN;
+ if (open_flags & O_CREAT) {
+ it->it_op |= IT_CREAT;
+ lookup_flags |= LOOKUP_CREATE;
+ }
+ it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
+ it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
+
+ /* Dentry added to dcache tree in ll_lookup_it */
+ de = ll_lookup_it(dir, dentry, it, lookup_flags);
+ if (IS_ERR(de))
+ rc = PTR_ERR(de);
+ else if (de != NULL)
+ dentry = de;
+
+ if (!rc) {
+ if (it_disposition(it, DISP_OPEN_CREATE)) {
+ /* Dentry instantiated in ll_create_it. */
+ rc = ll_create_it(dir, dentry, mode, it);
+ if (rc) {
+ /* We dget in ll_splice_alias. */
+ if (de != NULL)
+ dput(de);
+ goto out_release;
+ }
+
+ *opened |= FILE_CREATED;
+ }
+ if (d_really_is_positive(dentry) && it_disposition(it, DISP_OPEN_OPEN)) {
+ /* Open dentry. */
+ if (S_ISFIFO(d_inode(dentry)->i_mode)) {
+ /* We cannot call open here as it would
+ * deadlock.
+ */
+ if (it_disposition(it, DISP_ENQ_OPEN_REF))
+ ptlrpc_req_finished(
+ (struct ptlrpc_request *)
+ it->d.lustre.it_data);
+ rc = finish_no_open(file, de);
+ } else {
+ file->private_data = it;
+ rc = finish_open(file, dentry, NULL, opened);
+ /* We dget in ll_splice_alias. finish_open takes
+ * care of dget for fd open.
+ */
+ if (de != NULL)
+ dput(de);
+ }
+ } else {
+ rc = finish_no_open(file, de);
+ }
+ }
+
+out_release:
+ ll_intent_release(it);
+ OBD_FREE(it, sizeof(*it));
+
+ return rc;
+}
+
+/* We depend on "mode" being set with the proper file type/umask by now */
+static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it)
+{
+ struct inode *inode = NULL;
+ struct ptlrpc_request *request = NULL;
+ struct ll_sb_info *sbi = ll_i2sbi(dir);
+ int rc;
+
+ LASSERT(it && it->d.lustre.it_disposition);
+
+ LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF));
+ request = it->d.lustre.it_data;
+ it_clear_disposition(it, DISP_ENQ_CREATE_REF);
+ rc = ll_prep_inode(&inode, request, dir->i_sb, it);
+ if (rc) {
+ inode = ERR_PTR(rc);
+ goto out;
+ }
+
+ LASSERT(ll_d_hlist_empty(&inode->i_dentry));
+
+ /* We asked for a lock on the directory, but were granted a
+ * lock on the inode. Since we finally have an inode pointer,
+ * stuff it in the lock. */
+ CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
+ inode, inode->i_ino, inode->i_generation);
+ ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+ out:
+ ptlrpc_req_finished(request);
+ return inode;
+}
+
+/*
+ * By the time this is called, we already have created the directory cache
+ * entry for the new file, but it is so far negative - it has no inode.
+ *
+ * We defer creating the OBD object(s) until open, to keep the intent and
+ * non-intent code paths similar, and also because we do not have the MDS
+ * inode number before calling ll_create_node() (which is needed for LOV),
+ * so we would need to do yet another RPC to the MDS to store the LOV EA
+ * data on the MDS. If needed, we would pass the PACKED lmm as data and
+ * lmm_size in datalen (the MDS still has code which will handle that).
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
+ struct lookup_intent *it)
+{
+ struct inode *inode;
+ int rc = 0;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),intent=%s\n",
+ dentry, dir->i_ino,
+ dir->i_generation, dir, LL_IT2STR(it));
+
+ rc = it_open_error(DISP_OPEN_CREATE, it);
+ if (rc)
+ return rc;
+
+ inode = ll_create_node(dir, it);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static void ll_update_times(struct ptlrpc_request *request,
+ struct inode *inode)
+{
+ struct mdt_body *body = req_capsule_server_get(&request->rq_pill,
+ &RMF_MDT_BODY);
+
+ LASSERT(body);
+ if (body->valid & OBD_MD_FLMTIME &&
+ body->mtime > LTIME_S(inode->i_mtime)) {
+ CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n",
+ inode->i_ino, LTIME_S(inode->i_mtime), body->mtime);
+ LTIME_S(inode->i_mtime) = body->mtime;
+ }
+ if (body->valid & OBD_MD_FLCTIME &&
+ body->ctime > LTIME_S(inode->i_ctime))
+ LTIME_S(inode->i_ctime) = body->ctime;
+}
+
+static int ll_new_node(struct inode *dir, struct dentry *dentry,
+ const char *tgt, int mode, int rdev,
+ __u32 opc)
+{
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ struct inode *inode = NULL;
+ struct ll_sb_info *sbi = ll_i2sbi(dir);
+ int tgt_len = 0;
+ int err;
+
+ if (unlikely(tgt != NULL))
+ tgt_len = strlen(tgt) + 1;
+
+ op_data = ll_prep_md_op_data(NULL, dir, NULL,
+ dentry->d_name.name,
+ dentry->d_name.len,
+ 0, opc, NULL);
+ if (IS_ERR(op_data)) {
+ err = PTR_ERR(op_data);
+ goto err_exit;
+ }
+
+ err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode,
+ from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns, current_fsgid()),
+ cfs_curproc_cap_pack(), rdev, &request);
+ ll_finish_md_op_data(op_data);
+ if (err)
+ goto err_exit;
+
+ ll_update_times(request, dir);
+
+ err = ll_prep_inode(&inode, request, dir->i_sb, NULL);
+ if (err)
+ goto err_exit;
+
+ d_instantiate(dentry, inode);
+err_exit:
+ ptlrpc_req_finished(request);
+
+ return err;
+}
+
+static int ll_mknod(struct inode *dir, struct dentry *dchild,
+ umode_t mode, dev_t rdev)
+{
+ int err;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p) mode %o dev %x\n",
+ dchild, dir->i_ino, dir->i_generation, dir,
+ mode, old_encode_dev(rdev));
+
+ if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+ mode &= ~current_umask();
+
+ switch (mode & S_IFMT) {
+ case 0:
+ mode |= S_IFREG; /* for mode = 0 case, fallthrough */
+ case S_IFREG:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ err = ll_new_node(dir, dchild, NULL, mode,
+ old_encode_dev(rdev),
+ LUSTRE_OPC_MKNOD);
+ break;
+ case S_IFDIR:
+ err = -EPERM;
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ if (!err)
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1);
+
+ return err;
+}
+
+/*
+ * Plain create. Intent create is handled in atomic_open.
+ */
+static int ll_create_nd(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
+{
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),flags=%u, excl=%d\n",
+ dentry, dir->i_ino,
+ dir->i_generation, dir, mode, want_excl);
+
+ rc = ll_mknod(dir, dentry, mode, 0);
+
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n",
+ dentry, d_unhashed(dentry));
+
+ return rc;
+}
+
+static inline void ll_get_child_fid(struct dentry *child, struct lu_fid *fid)
+{
+ if (d_really_is_positive(child))
+ *fid = *ll_inode2fid(d_inode(child));
+}
+
+/**
+ * Remove dir entry
+ */
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen)
+{
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+ namelen, name, dir->i_ino, dir->i_generation, dir);
+
+ op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name),
+ S_IFDIR, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+ op_data->op_cli_flags |= CLI_RM_ENTRY;
+ rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+ ll_finish_md_op_data(op_data);
+ if (rc == 0) {
+ ll_update_times(request, dir);
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+ }
+
+ ptlrpc_req_finished(request);
+ return rc;
+}
+
+int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
+{
+ struct mdt_body *body;
+ struct lov_mds_md *eadata;
+ struct lov_stripe_md *lsm = NULL;
+ struct obd_trans_info oti = { 0 };
+ struct obdo *oa;
+ struct obd_capa *oc = NULL;
+ int rc;
+
+ /* req is swabbed so this is safe */
+ body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+ if (!(body->valid & OBD_MD_FLEASIZE))
+ return 0;
+
+ if (body->eadatasize == 0) {
+ CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
+ rc = -EPROTO;
+ goto out;
+ }
+
+ /* The MDS sent back the EA because we unlinked the last reference
+ * to this file. Use this EA to unlink the objects on the OST.
+ * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
+ * check it is complete and sensible. */
+ eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD,
+ body->eadatasize);
+ LASSERT(eadata != NULL);
+
+ rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize);
+ if (rc < 0) {
+ CERROR("obd_unpackmd: %d\n", rc);
+ goto out;
+ }
+ LASSERT(rc >= sizeof(*lsm));
+
+ OBDO_ALLOC(oa);
+ if (oa == NULL) {
+ rc = -ENOMEM;
+ goto out_free_memmd;
+ }
+
+ oa->o_oi = lsm->lsm_oi;
+ oa->o_mode = body->mode & S_IFMT;
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP;
+
+ if (body->valid & OBD_MD_FLCOOKIE) {
+ oa->o_valid |= OBD_MD_FLCOOKIE;
+ oti.oti_logcookies =
+ req_capsule_server_sized_get(&request->rq_pill,
+ &RMF_LOGCOOKIES,
+ sizeof(struct llog_cookie) *
+ lsm->lsm_stripe_count);
+ if (oti.oti_logcookies == NULL) {
+ oa->o_valid &= ~OBD_MD_FLCOOKIE;
+ body->valid &= ~OBD_MD_FLCOOKIE;
+ }
+ }
+
+ if (body->valid & OBD_MD_FLOSSCAPA) {
+ rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc);
+ if (rc)
+ goto out_free_memmd;
+ }
+
+ rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti,
+ ll_i2mdexp(dir), oc);
+ capa_put(oc);
+ if (rc)
+ CERROR("obd destroy objid "DOSTID" error %d\n",
+ POSTID(&lsm->lsm_oi), rc);
+out_free_memmd:
+ obd_free_memmd(ll_i2dtexp(dir), &lsm);
+ OBDO_FREE(oa);
+out:
+ return rc;
+}
+
+/* ll_unlink() doesn't update the inode with the new link count.
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon whether
+ * any lock exists. They will also recycle dentries and inodes based upon
+ * locks. b=20433 */
+static int ll_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n",
+ dentry, dir->i_ino, dir->i_generation, dir);
+
+ op_data = ll_prep_md_op_data(NULL, dir, NULL,
+ dentry->d_name.name,
+ dentry->d_name.len,
+ 0, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ ll_get_child_fid(dentry, &op_data->op_fid3);
+ op_data->op_fid2 = op_data->op_fid3;
+ rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+ ll_finish_md_op_data(op_data);
+ if (rc)
+ goto out;
+
+ ll_update_times(request, dir);
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+
+ rc = ll_objects_destroy(request, dir);
+ out:
+ ptlrpc_req_finished(request);
+ return rc;
+}
+
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode)
+{
+ int err;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n",
+ dentry, dir->i_ino, dir->i_generation, dir);
+
+ if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+ mode &= ~current_umask();
+ mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR;
+ err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR);
+
+ if (!err)
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1);
+
+ return err;
+}
+
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n",
+ dentry, dir->i_ino, dir->i_generation, dir);
+
+ op_data = ll_prep_md_op_data(NULL, dir, NULL,
+ dentry->d_name.name,
+ dentry->d_name.len,
+ S_IFDIR, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ ll_get_child_fid(dentry, &op_data->op_fid3);
+ op_data->op_fid2 = op_data->op_fid3;
+ rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+ ll_finish_md_op_data(op_data);
+ if (rc == 0) {
+ ll_update_times(request, dir);
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+ }
+
+ ptlrpc_req_finished(request);
+ return rc;
+}
+
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+ const char *oldname)
+{
+ int err;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p),target=%.*s\n",
+ dentry, dir->i_ino, dir->i_generation,
+ dir, 3000, oldname);
+
+ err = ll_new_node(dir, dentry, oldname, S_IFLNK | S_IRWXUGO,
+ 0, LUSTRE_OPC_SYMLINK);
+
+ if (!err)
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1);
+
+ return err;
+}
+
+static int ll_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ struct inode *src = d_inode(old_dentry);
+ struct ll_sb_info *sbi = ll_i2sbi(dir);
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ int err;
+
+ CDEBUG(D_VFSTRACE,
+ "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%pd\n",
+ src->i_ino, src->i_generation, src, dir->i_ino,
+ dir->i_generation, dir, new_dentry);
+
+ op_data = ll_prep_md_op_data(NULL, src, dir, new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ 0, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ err = md_link(sbi->ll_md_exp, op_data, &request);
+ ll_finish_md_op_data(op_data);
+ if (err)
+ goto out;
+
+ ll_update_times(request, dir);
+ ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1);
+out:
+ ptlrpc_req_finished(request);
+ return err;
+}
+
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ptlrpc_request *request = NULL;
+ struct ll_sb_info *sbi = ll_i2sbi(old_dir);
+ struct md_op_data *op_data;
+ int err;
+
+ CDEBUG(D_VFSTRACE,
+ "VFS Op:oldname=%pd,src_dir=%lu/%u(%p),newname=%pd,tgt_dir=%lu/%u(%p)\n",
+ old_dentry, old_dir->i_ino, old_dir->i_generation, old_dir,
+ new_dentry, new_dir->i_ino, new_dir->i_generation, new_dir);
+
+ op_data = ll_prep_md_op_data(NULL, old_dir, new_dir, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ ll_get_child_fid(old_dentry, &op_data->op_fid3);
+ ll_get_child_fid(new_dentry, &op_data->op_fid4);
+ err = md_rename(sbi->ll_md_exp, op_data,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, &request);
+ ll_finish_md_op_data(op_data);
+ if (!err) {
+ ll_update_times(request, old_dir);
+ ll_update_times(request, new_dir);
+ ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1);
+ err = ll_objects_destroy(request, old_dir);
+ }
+
+ ptlrpc_req_finished(request);
+ if (!err)
+ d_move(old_dentry, new_dentry);
+ return err;
+}
+
+const struct inode_operations ll_dir_inode_operations = {
+ .mknod = ll_mknod,
+ .atomic_open = ll_atomic_open,
+ .lookup = ll_lookup_nd,
+ .create = ll_create_nd,
+ /* We need all these non-raw things for NFSD, to not patch it. */
+ .unlink = ll_unlink,
+ .mkdir = ll_mkdir,
+ .rmdir = ll_rmdir,
+ .symlink = ll_symlink,
+ .link = ll_link,
+ .rename = ll_rename,
+ .setattr = ll_setattr,
+ .getattr = ll_getattr,
+ .permission = ll_inode_permission,
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+ .get_acl = ll_get_acl,
+};
+
+const struct inode_operations ll_special_inode_operations = {
+ .setattr = ll_setattr,
+ .getattr = ll_getattr,
+ .permission = ll_inode_permission,
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+ .get_acl = ll_get_acl,
+};
diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c
new file mode 100644
index 000000000..a58182600
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/remote_perm.c
@@ -0,0 +1,331 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/remote_perm.c
+ *
+ * Lustre Permission Cache for Remote Client
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "../include/lustre_lite.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_disk.h"
+#include "../include/lustre_param.h"
+#include "llite_internal.h"
+
+struct kmem_cache *ll_remote_perm_cachep = NULL;
+struct kmem_cache *ll_rmtperm_hash_cachep = NULL;
+
+static inline struct ll_remote_perm *alloc_ll_remote_perm(void)
+{
+ struct ll_remote_perm *lrp;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL);
+ if (lrp)
+ INIT_HLIST_NODE(&lrp->lrp_list);
+ return lrp;
+}
+
+static inline void free_ll_remote_perm(struct ll_remote_perm *lrp)
+{
+ if (!lrp)
+ return;
+
+ if (!hlist_unhashed(&lrp->lrp_list))
+ hlist_del(&lrp->lrp_list);
+ OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp));
+}
+
+static struct hlist_head *alloc_rmtperm_hash(void)
+{
+ struct hlist_head *hash;
+ int i;
+
+ OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep,
+ REMOTE_PERM_HASHSIZE * sizeof(*hash),
+ GFP_IOFS);
+ if (!hash)
+ return NULL;
+
+ for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+ INIT_HLIST_HEAD(hash + i);
+
+ return hash;
+}
+
+void free_rmtperm_hash(struct hlist_head *hash)
+{
+ int i;
+ struct ll_remote_perm *lrp;
+ struct hlist_node *next;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+ hlist_for_each_entry_safe(lrp, next, hash + i,
+ lrp_list)
+ free_ll_remote_perm(lrp);
+ OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep,
+ REMOTE_PERM_HASHSIZE * sizeof(*hash));
+}
+
+static inline int remote_perm_hashfunc(uid_t uid)
+{
+ return uid & (REMOTE_PERM_HASHSIZE - 1);
+}
+
+/* NB: setxid permission is not checked here; instead it's done on the
+ * MDT when the client gets the remote permission. */
+static int do_check_remote_perm(struct ll_inode_info *lli, int mask)
+{
+ struct hlist_head *head;
+ struct ll_remote_perm *lrp;
+ int found = 0, rc;
+
+ if (!lli->lli_remote_perms)
+ return -ENOENT;
+
+ head = lli->lli_remote_perms +
+ remote_perm_hashfunc(from_kuid(&init_user_ns, current_uid()));
+
+ spin_lock(&lli->lli_lock);
+ hlist_for_each_entry(lrp, head, lrp_list) {
+ if (lrp->lrp_uid != from_kuid(&init_user_ns, current_uid()))
+ continue;
+ if (lrp->lrp_gid != from_kgid(&init_user_ns, current_gid()))
+ continue;
+ if (lrp->lrp_fsuid != from_kuid(&init_user_ns, current_fsuid()))
+ continue;
+ if (lrp->lrp_fsgid != from_kgid(&init_user_ns, current_fsgid()))
+ continue;
+ found = 1;
+ break;
+ }
+
+ if (!found) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n",
+ lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+ lrp->lrp_access_perm);
+ rc = ((lrp->lrp_access_perm & mask) == mask) ? 0 : -EACCES;
+
+out:
+ spin_unlock(&lli->lli_lock);
+ return rc;
+}
+
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_remote_perm *lrp = NULL, *tmp = NULL;
+ struct hlist_head *head, *perm_hash = NULL;
+
+ LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT);
+
+#if 0
+ if (perm->rp_uid != current->uid ||
+ perm->rp_gid != current->gid ||
+ perm->rp_fsuid != current->fsuid ||
+ perm->rp_fsgid != current->fsgid) {
+ /* user might setxid in this small period */
+ CDEBUG(D_SEC,
+ "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n",
+ perm->rp_uid, perm->rp_gid, perm->rp_fsuid,
+ perm->rp_fsgid, current->uid, current->gid,
+ current->fsuid, current->fsgid);
+ return -EAGAIN;
+ }
+#endif
+
+ if (!lli->lli_remote_perms) {
+ perm_hash = alloc_rmtperm_hash();
+ if (perm_hash == NULL) {
+ CERROR("alloc lli_remote_perms failed!\n");
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock(&lli->lli_lock);
+
+ if (!lli->lli_remote_perms)
+ lli->lli_remote_perms = perm_hash;
+ else
+ free_rmtperm_hash(perm_hash);
+
+ head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid);
+
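+ /* Look for an entry matching all four IDs under lli_lock; if none is
+ * found, drop the lock, allocate a new entry and retry the search. */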
+again:
+ hlist_for_each_entry(tmp, head, lrp_list) {
+ if (tmp->lrp_uid != perm->rp_uid)
+ continue;
+ if (tmp->lrp_gid != perm->rp_gid)
+ continue;
+ if (tmp->lrp_fsuid != perm->rp_fsuid)
+ continue;
+ if (tmp->lrp_fsgid != perm->rp_fsgid)
+ continue;
+ free_ll_remote_perm(lrp);
+ lrp = tmp;
+ break;
+ }
+
+ if (!lrp) {
+ spin_unlock(&lli->lli_lock);
+ lrp = alloc_ll_remote_perm();
+ if (!lrp) {
+ CERROR("alloc memory for ll_remote_perm failed!\n");
+ return -ENOMEM;
+ }
+ spin_lock(&lli->lli_lock);
+ goto again;
+ }
+
+ lrp->lrp_access_perm = perm->rp_access_perm;
+ if (lrp != tmp) {
+ lrp->lrp_uid = perm->rp_uid;
+ lrp->lrp_gid = perm->rp_gid;
+ lrp->lrp_fsuid = perm->rp_fsuid;
+ lrp->lrp_fsgid = perm->rp_fsgid;
+ hlist_add_head(&lrp->lrp_list, head);
+ }
+ lli->lli_rmtperm_time = cfs_time_current();
+ spin_unlock(&lli->lli_lock);
+
+ CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n",
+ lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+ lrp->lrp_access_perm);
+
+ return 0;
+}
+
+int lustre_check_remote_perm(struct inode *inode, int mask)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *req = NULL;
+ struct mdt_remote_perm *perm;
+ struct obd_capa *oc;
+ unsigned long save;
+ int i = 0, rc;
+
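+ /* Check the cached permission first; on a miss, fetch it from the MDT
+ * under lli_rmtperm_mutex. lli_rmtperm_time is used to detect whether
+ * another thread refreshed the cache while we waited for the mutex. */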
+ do {
+ save = lli->lli_rmtperm_time;
+ rc = do_check_remote_perm(lli, mask);
+ if (!rc || (rc != -ENOENT && i))
+ break;
+
+ might_sleep();
+
+ mutex_lock(&lli->lli_rmtperm_mutex);
+ /* check again */
+ if (save != lli->lli_rmtperm_time) {
+ rc = do_check_remote_perm(lli, mask);
+ if (!rc || (rc != -ENOENT && i)) {
+ mutex_unlock(&lli->lli_rmtperm_mutex);
+ break;
+ }
+ }
+
+ if (i++ > 5) {
+ CERROR("check remote perm falls in dead loop!\n");
+ LBUG();
+ }
+
+ oc = ll_mdscapa_get(inode);
+ rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+ ll_i2suppgid(inode), &req);
+ capa_put(oc);
+ if (rc) {
+ mutex_unlock(&lli->lli_rmtperm_mutex);
+ break;
+ }
+
+ perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL,
+ lustre_swab_mdt_remote_perm);
+ if (unlikely(perm == NULL)) {
+ mutex_unlock(&lli->lli_rmtperm_mutex);
+ rc = -EPROTO;
+ break;
+ }
+
+ rc = ll_update_remote_perm(inode, perm);
+ mutex_unlock(&lli->lli_rmtperm_mutex);
+ if (rc == -ENOMEM)
+ break;
+
+ ptlrpc_req_finished(req);
+ req = NULL;
+ } while (1);
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+#if 0 /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock,
+ * because it will fail sanity test 48.
+ */
+void ll_free_remote_perms(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct hlist_head *hash = lli->lli_remote_perms;
+ struct ll_remote_perm *lrp;
+ struct hlist_node *node, *next;
+ int i;
+
+ LASSERT(hash);
+
+ spin_lock(&lli->lli_lock);
+
+ for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) {
+ hlist_for_each_entry_safe(lrp, node, next, hash + i,
+ lrp_list)
+ free_ll_remote_perm(lrp);
+ }
+
+ spin_unlock(&lli->lli_lock);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
new file mode 100644
index 000000000..991d20c50
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -0,0 +1,1289 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/writeback.h>
+#include <linux/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+/* current_is_kswapd() */
+#include <linux/swap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/lustre_lite.h"
+#include "../include/obd_cksum.h"
+#include "llite_internal.h"
+#include "../include/linux/lustre_compat25.h"
+
+/**
+ * Finalizes cl-data before exiting typical address_space operation. Dual to
+ * ll_cl_init().
+ */
+static void ll_cl_fini(struct ll_cl_context *lcc)
+{
+ struct lu_env *env = lcc->lcc_env;
+ struct cl_io *io = lcc->lcc_io;
+ struct cl_page *page = lcc->lcc_page;
+
+ LASSERT(lcc->lcc_cookie == current);
+ LASSERT(env != NULL);
+
+ if (page != NULL) {
+ lu_ref_del(&page->cp_reference, "cl_io", io);
+ cl_page_put(env, page);
+ }
+
+ cl_env_put(env, &lcc->lcc_refcheck);
+}
+
+/**
+ * Initializes common cl-data at the typical address_space operation entry
+ * point.
+ */
+static struct ll_cl_context *ll_cl_init(struct file *file,
+ struct page *vmpage, int create)
+{
+ struct ll_cl_context *lcc;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_object *clob;
+ struct ccc_io *cio;
+
+ int refcheck;
+ int result = 0;
+
+ clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+ LASSERT(clob != NULL);
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return ERR_CAST(env);
+
+ lcc = &vvp_env_info(env)->vti_io_ctx;
+ memset(lcc, 0, sizeof(*lcc));
+ lcc->lcc_env = env;
+ lcc->lcc_refcheck = refcheck;
+ lcc->lcc_cookie = current;
+
+ cio = ccc_env_io(env);
+ io = cio->cui_cl.cis_io;
+ if (io == NULL && create) {
+ struct inode *inode = vmpage->mapping->host;
+ loff_t pos;
+
+ if (mutex_trylock(&inode->i_mutex)) {
+ mutex_unlock(&(inode)->i_mutex);
+
+ /* this is too bad. Someone is trying to write the
+ * page w/o holding inode mutex. This means we can
+ * add dirty pages into cache during truncate */
+ CERROR("Proc %s is dirtying page w/o inode lock, this will break truncate\n",
+ current->comm);
+ dump_stack();
+ LBUG();
+ return ERR_PTR(-EIO);
+ }
+
+ /*
+ * The loop-back driver calls the ->prepare_write() method
+ * directly, bypassing the file system ->write() operation,
+ * so the cl_io has to be created here.
+ */
+ io = ccc_env_thread_io(env);
+ ll_io_init(io, file, 1);
+
+ /* No lock at all for this kind of IO - we can't take one because
+ * we already hold the page lock and taking it would deadlock.
+ * XXX: This causes poor performance for the loop device - one
+ * page per RPC.
+ * For better performance, users should use the lloop driver
+ * instead.
+ */
+ io->ci_lockreq = CILR_NEVER;
+
+ pos = vmpage->index << PAGE_CACHE_SHIFT;
+
+ /* Create a temp IO to serve write. */
+ result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
+ if (result == 0) {
+ cio->cui_fd = LUSTRE_FPRIVATE(file);
+ cio->cui_iter = NULL;
+ result = cl_io_iter_init(env, io);
+ if (result == 0) {
+ result = cl_io_lock(env, io);
+ if (result == 0)
+ result = cl_io_start(env, io);
+ }
+ } else
+ result = io->ci_result;
+ }
+
+ lcc->lcc_io = io;
+ if (io == NULL)
+ result = -EIO;
+ if (result == 0) {
+ struct cl_page *page;
+
+ LASSERT(io != NULL);
+ LASSERT(io->ci_state == CIS_IO_GOING);
+ LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file));
+ page = cl_page_find(env, clob, vmpage->index, vmpage,
+ CPT_CACHEABLE);
+ if (!IS_ERR(page)) {
+ lcc->lcc_page = page;
+ lu_ref_add(&page->cp_reference, "cl_io", io);
+ result = 0;
+ } else
+ result = PTR_ERR(page);
+ }
+ if (result) {
+ ll_cl_fini(lcc);
+ lcc = ERR_PTR(result);
+ }
+
+ CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n",
+ vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result,
+ env, io);
+ return lcc;
+}
+
+static struct ll_cl_context *ll_cl_get(void)
+{
+ struct ll_cl_context *lcc;
+ struct lu_env *env;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ LASSERT(!IS_ERR(env));
+ lcc = &vvp_env_info(env)->vti_io_ctx;
+ LASSERT(env == lcc->lcc_env);
+ LASSERT(current == lcc->lcc_cookie);
+ cl_env_put(env, &refcheck);
+
+ /* The env reference was obtained in ll_cl_init(), so it is still usable. */
+ return lcc;
+}
+
+/**
+ * ->prepare_write() address space operation called by generic_file_write()
+ * for every page during write.
+ */
+int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from,
+ unsigned to)
+{
+ struct ll_cl_context *lcc;
+ int result;
+
+ lcc = ll_cl_init(file, vmpage, 1);
+ if (!IS_ERR(lcc)) {
+ struct lu_env *env = lcc->lcc_env;
+ struct cl_io *io = lcc->lcc_io;
+ struct cl_page *page = lcc->lcc_page;
+
+ cl_page_assume(env, io, page);
+
+ result = cl_io_prepare_write(env, io, page, from, to);
+ if (result == 0) {
+ /*
+ * Add a reference, so that page is not evicted from
+ * the cache until ->commit_write() is called.
+ */
+ cl_page_get(page);
+ lu_ref_add(&page->cp_reference, "prepare_write",
+ current);
+ } else {
+ cl_page_unassume(env, io, page);
+ ll_cl_fini(lcc);
+ }
+ /* returning 0 in prepare assumes commit must be called
+ * afterwards */
+ } else {
+ result = PTR_ERR(lcc);
+ }
+ return result;
+}
+
+int ll_commit_write(struct file *file, struct page *vmpage, unsigned from,
+ unsigned to)
+{
+ struct ll_cl_context *lcc;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_page *page;
+ int result = 0;
+
+ lcc = ll_cl_get();
+ env = lcc->lcc_env;
+ page = lcc->lcc_page;
+ io = lcc->lcc_io;
+
+ LASSERT(cl_page_is_owned(page, io));
+ LASSERT(from <= to);
+ if (from != to) /* handle short write case. */
+ result = cl_io_commit_write(env, io, page, from, to);
+ if (cl_page_is_owned(page, io))
+ cl_page_unassume(env, io, page);
+
+ /*
+ * Release reference acquired by ll_prepare_write().
+ */
+ lu_ref_del(&page->cp_reference, "prepare_write", current);
+ cl_page_put(env, page);
+ ll_cl_fini(lcc);
+ return result;
+}
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt)
+{
+ __u64 opc;
+
+ opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
+ return ll_osscapa_get(inode, opc);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/**
+ * Get readahead pages from the filesystem readahead pool of the client for a
+ * thread.
+ *
+ * \param sbi superblock for filesystem readahead state ll_ra_info
+ * \param ria per-thread readahead state
+ * \param pages number of pages requested for readahead for the thread.
+ *
+ * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
+ * It should work well if ra_max_pages is much greater than a single
+ * file's read-ahead window, and not too many threads are contending for
+ * these readahead pages.
+ *
+ * TODO: There may be a 'global sync problem' if many threads are trying
+ * to get an ra budget that is larger than the remaining readahead pages
+ * and reach here at exactly the same time. They will compute \a ret to
+ * consume the remaining pages, but will fail at atomic_add_return() and
+ * get a zero ra window, although there is still ra space remaining. - Jay */
+
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
+ struct ra_io_arg *ria,
+ unsigned long pages)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ long ret;
+
+ /* If fewer read-ahead pages than 1MB worth are left, do not do
+ * read-ahead; otherwise small read RPCs (< 1MB) would be formed,
+ * which hurts server performance a lot. */
+ ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages);
+ if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) {
+ ret = 0;
+ goto out;
+ }
+
+ /* If the non-strided (ria_pages == 0) readahead window
+ * (ria_start + ret) has grown across an RPC boundary, then trim
+ * the readahead size by the amount beyond the RPC so it ends on an
+ * RPC boundary. If the readahead window already ends on an RPC
+ * boundary (beyond_rpc == 0), or the whole window lies within the
+ * span beyond the last boundary (beyond_rpc >= ret), the readahead
+ * size is unchanged.
+ * The (beyond_rpc != 0) check is skipped since the conditional
+ * branch is more expensive than subtracting zero from the result.
+ *
+ * Strided reads are left unaligned so that small fragments beyond
+ * the RPC boundary do not require an extra read RPC. */
+ if (ria->ria_pages == 0) {
+ long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
+ if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
+ ret -= beyond_rpc;
+ }
+
+ if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+ atomic_sub(ret, &ra->ra_cur_pages);
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
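+
+/* Illustrative example for ll_ra_count_get() above, assuming
+ * PTLRPC_MAX_BRW_PAGES == 256 (1MB RPCs with 4KB pages): if fewer than
+ * 256 budget pages remain (and at least that many were asked for), no
+ * readahead is granted at all. For a non-strided request with
+ * ria_start == 100 and a granted ret == 300, beyond_rpc is
+ * (100 + 300) % 256 == 144, which is less than ret, so ret is trimmed
+ * to 156 and the window ends at page 255, exactly on the RPC boundary. */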
+
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ atomic_sub(len, &ra->ra_cur_pages);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
+{
+ LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
+ lprocfs_counter_incr(sbi->ll_ra_stats, which);
+}
+
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
+ ll_ra_stats_inc_sbi(sbi, which);
+}
+
+#define RAS_CDEBUG(ras) \
+ CDEBUG(D_READA, \
+ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu " \
+ "csr %lu sf %lu sp %lu sl %lu\n", \
+ ras->ras_last_readpage, ras->ras_consecutive_requests, \
+ ras->ras_consecutive_pages, ras->ras_window_start, \
+ ras->ras_window_len, ras->ras_next_readahead, \
+ ras->ras_requests, ras->ras_request_index, \
+ ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
+ ras->ras_stride_pages, ras->ras_stride_length)
+
+static int index_in_window(unsigned long index, unsigned long point,
+ unsigned long before, unsigned long after)
+{
+ unsigned long start = point - before, end = point + after;
+
+ if (start > point)
+ start = 0;
+ if (end < point)
+ end = ~0;
+
+ return start <= index && index <= end;
+}
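+
+/* A worked example of the clamping in index_in_window() above (hypothetical
+ * values): with point = 5, before = 8, after = 8, "point - before"
+ * underflows the unsigned type, so start is clamped to 0 and the window
+ * becomes [0, 13]. Likewise a point near ULONG_MAX clamps end to ~0. */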
+
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+ struct ll_file_data *fd;
+
+ fd = LUSTRE_FPRIVATE(f);
+ return &fd->fd_ras;
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+ struct ll_readahead_state *ras;
+
+ ras = ll_ras_get(f);
+
+ spin_lock(&ras->ras_lock);
+ ras->ras_requests++;
+ ras->ras_request_index = 0;
+ ras->ras_consecutive_requests++;
+ rar->lrr_reader = current;
+
+ list_add(&rar->lrr_linkage, &ras->ras_read_beads);
+ spin_unlock(&ras->ras_lock);
+}
+
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+ struct ll_readahead_state *ras;
+
+ ras = ll_ras_get(f);
+
+ spin_lock(&ras->ras_lock);
+ list_del_init(&rar->lrr_linkage);
+ spin_unlock(&ras->ras_lock);
+}
+
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+ struct ll_ra_read *scan;
+
+ list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
+ if (scan->lrr_reader == current)
+ return scan;
+ }
+ return NULL;
+}
+
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+ struct ll_readahead_state *ras;
+ struct ll_ra_read *bead;
+
+ ras = ll_ras_get(f);
+
+ spin_lock(&ras->ras_lock);
+ bead = ll_ra_read_get_locked(ras);
+ spin_unlock(&ras->ras_lock);
+ return bead;
+}
+
+static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct cl_page *page,
+ struct page *vmpage)
+{
+ struct ccc_page *cp;
+ int rc;
+
+ rc = 0;
+ cl_page_assume(env, io, page);
+ lu_ref_add(&page->cp_reference, "ra", current);
+ cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+ if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) {
+ rc = cl_page_is_under_lock(env, io, page);
+ if (rc == -EBUSY) {
+ cp->cpg_defer_uptodate = 1;
+ cp->cpg_ra_used = 0;
+ cl_page_list_add(queue, page);
+ rc = 1;
+ } else {
+ cl_page_delete(env, page);
+ rc = -ENOLCK;
+ }
+ } else {
+ /* skip completed pages */
+ cl_page_unassume(env, io, page);
+ }
+ lu_ref_del(&page->cp_reference, "ra", current);
+ cl_page_put(env, page);
+ return rc;
+}
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * \retval +ve: page was added to \a queue.
+ *
+ * \retval -ENOLCK: there is no extent lock for this part of a file, stop
+ * read-ahead.
+ *
+ * \retval -ve, 0: page wasn't added to \a queue for other reason.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue,
+ pgoff_t index, struct address_space *mapping)
+{
+ struct page *vmpage;
+ struct cl_object *clob = ll_i2info(mapping->host)->lli_clob;
+ struct cl_page *page;
+ enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */
+ int rc = 0;
+ const char *msg = NULL;
+
+ vmpage = grab_cache_page_nowait(mapping, index);
+ if (vmpage != NULL) {
+ /* Check if vmpage was truncated or reclaimed */
+ if (vmpage->mapping == mapping) {
+ page = cl_page_find(env, clob, vmpage->index,
+ vmpage, CPT_CACHEABLE);
+ if (!IS_ERR(page)) {
+ rc = cl_read_ahead_page(env, io, queue,
+ page, vmpage);
+ if (rc == -ENOLCK) {
+ which = RA_STAT_FAILED_MATCH;
+ msg = "lock match failed";
+ }
+ } else {
+ which = RA_STAT_FAILED_GRAB_PAGE;
+ msg = "cl_page_find failed";
+ }
+ } else {
+ which = RA_STAT_WRONG_GRAB_PAGE;
+ msg = "g_c_p_n returned invalid page";
+ }
+ if (rc != 1)
+ unlock_page(vmpage);
+ page_cache_release(vmpage);
+ } else {
+ which = RA_STAT_FAILED_GRAB_PAGE;
+ msg = "g_c_p_n failed";
+ }
+ if (msg != NULL) {
+ ll_ra_stats_inc(mapping, which);
+ CDEBUG(D_READA, "%s\n", msg);
+ }
+ return rc;
+}
+
+#define RIA_DEBUG(ria) \
+ CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \
+ ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
+ ria->ria_pages)
+
+/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
+ * know what the actual RPC size is. If this needs to change, it makes more
+ * sense to tune the i_blkbits value for the file based on the OSTs it is
+ * striped over, rather than having a constant value for all files here. */
+
+/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)).
+ * Temporarily set RAS_INCREASE_STEP to 1MB. Once 4MB RPCs are enabled
+ * by default, this should be adjusted together with max_read_ahead_mb
+ * and max_read_ahead_per_file_mb; otherwise the readahead budget can be
+ * used up quickly, which affects read performance significantly. See LU-2816 */
+#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
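+/* With the common configuration of 4KB pages (PAGE_CACHE_SHIFT == 12) and
+ * ONE_MB_BRW_SIZE == 1MB, RAS_INCREASE_STEP() evaluates to 256 pages, i.e.
+ * the window grows in 1MB increments (values here are illustrative and
+ * depend on the actual build configuration). */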
+
+static inline int stride_io_mode(struct ll_readahead_state *ras)
+{
+ return ras->ras_consecutive_stride_requests > 1;
+}
+/* This function calculates how many pages will be read in
+ * [off, off + length] within a stride IO area where
+ * stride_offset = st_off, stride_length = st_len,
+ * stride_pages = st_pgs
+ *
+ * |------------------|*****|------------------|*****|------------|*****|....
+ * st_off
+ * |--- st_pgs ---|
+ * |----- st_len -----|
+ *
+ * How many pages it should read in such pattern
+ * |-------------------------------------------------------------|
+ * off
+ * |<------ length ------->|
+ *
+ * = |<----->| + |-------------------------------------| + |---|
+ * start_left st_pgs * i end_left
+ */
+static unsigned long
+stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
+ unsigned long off, unsigned long length)
+{
+ __u64 start = off > st_off ? off - st_off : 0;
+ __u64 end = off + length > st_off ? off + length - st_off : 0;
+ unsigned long start_left = 0;
+ unsigned long end_left = 0;
+ unsigned long pg_count;
+
+ if (st_len == 0 || length == 0 || end == 0)
+ return length;
+
+ start_left = do_div(start, st_len);
+ if (start_left < st_pgs)
+ start_left = st_pgs - start_left;
+ else
+ start_left = 0;
+
+ end_left = do_div(end, st_len);
+ if (end_left > st_pgs)
+ end_left = st_pgs;
+
+ CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu \n",
+ start, end, start_left, end_left);
+
+ if (start == end)
+ pg_count = end_left - (st_pgs - start_left);
+ else
+ pg_count = start_left + st_pgs * (end - start - 1) + end_left;
+
+ CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu pgcount %lu\n",
+ st_off, st_len, st_pgs, off, length, pg_count);
+
+ return pg_count;
+}
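+
+/* Worked example for stride_pg_count() above (hypothetical values):
+ * st_off = 0, st_len = 16, st_pgs = 4, off = 2, length = 40.
+ * start = 2, end = 42; start_left = 2 -> st_pgs - 2 = 2;
+ * end_left = 42 % 16 = 10 -> clamped to st_pgs = 4; start/end become
+ * stride indices 0 and 2, so pg_count = 2 + 4 * 1 + 4 = 10, matching the
+ * ten data pages 2-3, 16-19 and 32-35 that fall inside [2, 41]. */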
+
+static int ria_page_count(struct ra_io_arg *ria)
+{
+ __u64 length = ria->ria_end >= ria->ria_start ?
+ ria->ria_end - ria->ria_start + 1 : 0;
+
+ return stride_pg_count(ria->ria_stoff, ria->ria_length,
+ ria->ria_pages, ria->ria_start,
+ length);
+}
+
+/* Check whether the index is in the defined read-ahead window */
+static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
+{
+ /* If ria_length == ria_pages, this is non-stride I/O mode and
+ * idx should always be inside the read-ahead window in this case.
+ * For stride I/O mode, just check whether idx falls inside the
+ * ria_pages part of the stride. */
+ return ria->ria_length == 0 || ria->ria_length == ria->ria_pages ||
+ (idx >= ria->ria_stoff && (idx - ria->ria_stoff) %
+ ria->ria_length < ria->ria_pages);
+}
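+
+/* For ras_inside_ra_window() above (hypothetical values): with
+ * ria_stoff = 0, ria_length = 16 and ria_pages = 4, index 18 satisfies
+ * (18 - 0) % 16 == 2 < 4 and is inside the window, while index 22 gives
+ * 6 >= 4 and falls in the stride gap. */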
+
+static int ll_read_ahead_pages(const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *queue,
+ struct ra_io_arg *ria,
+ unsigned long *reserved_pages,
+ struct address_space *mapping,
+ unsigned long *ra_end)
+{
+ int rc, count = 0, stride_ria;
+ unsigned long page_idx;
+
+ LASSERT(ria != NULL);
+ RIA_DEBUG(ria);
+
+ stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
+ for (page_idx = ria->ria_start; page_idx <= ria->ria_end &&
+ *reserved_pages > 0; page_idx++) {
+ if (ras_inside_ra_window(page_idx, ria)) {
+ /* If the page is inside the read-ahead window */
+ rc = ll_read_ahead_page(env, io, queue,
+ page_idx, mapping);
+ if (rc == 1) {
+ (*reserved_pages)--;
+ count++;
+ } else if (rc == -ENOLCK)
+ break;
+ } else if (stride_ria) {
+ /* If it is not in the read-ahead window but stride
+ * read-ahead mode is active, check whether the stride
+ * gap should be skipped */
+ pgoff_t offset;
+ /* FIXME: This assertion is only valid for forward
+ * read-ahead; it will be fixed when backward
+ * read-ahead is implemented */
+ LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n",
+ page_idx,
+ ria->ria_start, ria->ria_end, ria->ria_stoff,
+ ria->ria_length, ria->ria_pages);
+ offset = page_idx - ria->ria_stoff;
+ offset = offset % (ria->ria_length);
+ if (offset > ria->ria_pages) {
+ page_idx += ria->ria_length - offset;
+ CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
+ ria->ria_length - offset);
+ continue;
+ }
+ }
+ }
+ *ra_end = page_idx;
+ return count;
+}
+
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+ struct ll_readahead_state *ras, struct address_space *mapping,
+ struct cl_page_list *queue, int flags)
+{
+ struct vvp_io *vio = vvp_env_io(env);
+ struct vvp_thread_info *vti = vvp_env_info(env);
+ struct cl_attr *attr = ccc_env_thread_attr(env);
+ unsigned long start = 0, end = 0, reserved;
+ unsigned long ra_end, len;
+ struct inode *inode;
+ struct ll_ra_read *bead;
+ struct ra_io_arg *ria = &vti->vti_ria;
+ struct ll_inode_info *lli;
+ struct cl_object *clob;
+ int ret = 0;
+ __u64 kms;
+
+ inode = mapping->host;
+ lli = ll_i2info(inode);
+ clob = lli->lli_clob;
+
+ memset(ria, 0, sizeof(*ria));
+
+ cl_object_attr_lock(clob);
+ ret = cl_object_attr_get(env, clob, attr);
+ cl_object_attr_unlock(clob);
+
+ if (ret != 0)
+ return ret;
+ kms = attr->cat_kms;
+ if (kms == 0) {
+ ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
+ return 0;
+ }
+
+ spin_lock(&ras->ras_lock);
+ if (vio->cui_ra_window_set)
+ bead = &vio->cui_bead;
+ else
+ bead = NULL;
+
+ /* Enlarge the RA window to encompass the full read */
+ if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+ bead->lrr_start + bead->lrr_count) {
+ ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+ ras->ras_window_start;
+ }
+ /* Reserve a part of the read-ahead window that we'll be issuing */
+ if (ras->ras_window_len) {
+ start = ras->ras_next_readahead;
+ end = ras->ras_window_start + ras->ras_window_len - 1;
+ }
+ if (end != 0) {
+ unsigned long rpc_boundary;
+ /*
+ * Align RA window to an optimal boundary.
+ *
+ * XXX This would be better to align to cl_max_pages_per_rpc
+ * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
+ * be aligned to the RAID stripe size in the future and that
+ * is more important than the RPC size.
+ */
+ /* Note: we only trim the window to the RPC boundary instead of
+ * extending it, to avoid reading too many pages during random
+ * reads. */
+ rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1));
+ if (rpc_boundary > 0)
+ rpc_boundary--;
+
+ if (rpc_boundary > start)
+ end = rpc_boundary;
+
+ /* Truncate RA window to end of file */
+ end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
+
+ ras->ras_next_readahead = max(end, end + 1);
+ RAS_CDEBUG(ras);
+ }
+ ria->ria_start = start;
+ ria->ria_end = end;
+ /* If stride I/O mode is detected, get stride window */
+ if (stride_io_mode(ras)) {
+ ria->ria_stoff = ras->ras_stride_offset;
+ ria->ria_length = ras->ras_stride_length;
+ ria->ria_pages = ras->ras_stride_pages;
+ }
+ spin_unlock(&ras->ras_lock);
+
+ if (end == 0) {
+ ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
+ return 0;
+ }
+ len = ria_page_count(ria);
+ if (len == 0)
+ return 0;
+
+ reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len);
+ if (reserved < len)
+ ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
+
+ CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved,
+ atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
+ ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
+
+ ret = ll_read_ahead_pages(env, io, queue,
+ ria, &reserved, mapping, &ra_end);
+
+ LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
+ if (reserved != 0)
+ ll_ra_count_put(ll_i2sbi(inode), reserved);
+
+ if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+ ll_ra_stats_inc(mapping, RA_STAT_EOF);
+
+ /* If we didn't get to the end of the region we reserved from
+ * the ras, we need to go back and update the ras so that the
+ * next read-ahead starts from where we left off. We only do so
+ * if the region we failed to issue read-ahead on is still ahead
+ * of the app and behind the next index to start read-ahead from. */
+ CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu\n",
+ ra_end, end, ria->ria_end);
+
+ if (ra_end != end + 1) {
+ spin_lock(&ras->ras_lock);
+ if (ra_end < ras->ras_next_readahead &&
+ index_in_window(ra_end, ras->ras_window_start, 0,
+ ras->ras_window_len)) {
+ ras->ras_next_readahead = ra_end;
+ RAS_CDEBUG(ras);
+ }
+ spin_unlock(&ras->ras_lock);
+ }
+
+ return ret;
+}
+
+static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
+ unsigned long index)
+{
+ ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+}
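+
+/* For ras_set_start() above: assuming RAS_INCREASE_STEP() is 256 pages, an
+ * index of 300 yields ras_window_start = 300 & ~255 = 256, i.e. the window
+ * start is rounded down to the nearest increase-step boundary. */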
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_reset(struct inode *inode, struct ll_readahead_state *ras,
+ unsigned long index)
+{
+ ras->ras_last_readpage = index;
+ ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_pages = 0;
+ ras->ras_window_len = 0;
+ ras_set_start(inode, ras, index);
+ ras->ras_next_readahead = max(ras->ras_window_start, index);
+
+ RAS_CDEBUG(ras);
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_stride_reset(struct ll_readahead_state *ras)
+{
+ ras->ras_consecutive_stride_requests = 0;
+ ras->ras_stride_length = 0;
+ ras->ras_stride_pages = 0;
+ RAS_CDEBUG(ras);
+}
+
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+ spin_lock_init(&ras->ras_lock);
+ ras_reset(inode, ras, 0);
+ ras->ras_requests = 0;
+ INIT_LIST_HEAD(&ras->ras_read_beads);
+}
+
+/*
+ * Check whether the read request is in the stride window.
+ * If it is in the stride window, return 1, otherwise return 0.
+ */
+static int index_in_stride_window(struct ll_readahead_state *ras,
+ unsigned long index)
+{
+ unsigned long stride_gap;
+
+ if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 ||
+ ras->ras_stride_pages == ras->ras_stride_length)
+ return 0;
+
+ stride_gap = index - ras->ras_last_readpage - 1;
+
+ /* If it is a contiguous read */
+ if (stride_gap == 0)
+ return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
+
+ /* Otherwise check the stride by itself */
+ return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
+ ras->ras_consecutive_pages == ras->ras_stride_pages;
+}
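+
+/* Worked example for index_in_stride_window() above (hypothetical values):
+ * with ras_stride_length = 16, ras_stride_pages = 4,
+ * ras_last_readpage = 19 and ras_consecutive_pages = 4, a read at index 32
+ * gives stride_gap = 12 == 16 - 4 with a full data chunk consumed, so it
+ * matches the stride. A contiguous read (stride_gap == 0) only matches
+ * while ras_consecutive_pages + 1 still fits within ras_stride_pages. */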
+
+static void ras_update_stride_detector(struct ll_readahead_state *ras,
+ unsigned long index)
+{
+ unsigned long stride_gap = index - ras->ras_last_readpage - 1;
+
+ if (!stride_io_mode(ras) && (stride_gap != 0 ||
+ ras->ras_consecutive_stride_requests == 0)) {
+ ras->ras_stride_pages = ras->ras_consecutive_pages;
+ ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
+ }
+ LASSERT(ras->ras_request_index == 0);
+ LASSERT(ras->ras_consecutive_stride_requests == 0);
+
+ if (index <= ras->ras_last_readpage) {
+ /* Reset the stride window; stride detection only handles forward reads */
+ ras_stride_reset(ras);
+ return;
+ }
+
+ ras->ras_stride_pages = ras->ras_consecutive_pages;
+ ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
+
+ RAS_CDEBUG(ras);
+ return;
+}
+
+static unsigned long
+stride_page_count(struct ll_readahead_state *ras, unsigned long len)
+{
+ return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length,
+ ras->ras_stride_pages, ras->ras_stride_offset,
+ len);
+}
+
+/* The stride read-ahead window will be increased by inc_len according to
+ * the stride I/O pattern */
+static void ras_stride_increase_window(struct ll_readahead_state *ras,
+ struct ll_ra_info *ra,
+ unsigned long inc_len)
+{
+ unsigned long left, step, window_len;
+ unsigned long stride_len;
+
+ LASSERT(ras->ras_stride_length > 0);
+ LASSERTF(ras->ras_window_start + ras->ras_window_len
+ >= ras->ras_stride_offset, "window_start %lu, window_len %lu stride_offset %lu\n",
+ ras->ras_window_start,
+ ras->ras_window_len, ras->ras_stride_offset);
+
+ stride_len = ras->ras_window_start + ras->ras_window_len -
+ ras->ras_stride_offset;
+
+ left = stride_len % ras->ras_stride_length;
+ window_len = ras->ras_window_len - left;
+
+ if (left < ras->ras_stride_pages)
+ left += inc_len;
+ else
+ left = ras->ras_stride_pages + inc_len;
+
+ LASSERT(ras->ras_stride_pages != 0);
+
+ step = left / ras->ras_stride_pages;
+ left %= ras->ras_stride_pages;
+
+ window_len += step * ras->ras_stride_length + left;
+
+ if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
+ ras->ras_window_len = window_len;
+
+ RAS_CDEBUG(ras);
+}
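+
+/* Worked example for ras_stride_increase_window() above (hypothetical
+ * values): with stride_length = 16, stride_pages = 4, stride_offset = 0,
+ * window_start = 0, window_len = 20 and inc_len = 8: stride_len = 20,
+ * left = 4, window_len is first trimmed to 16, left becomes 4 + 8 = 12,
+ * so step = 3 and the new window_len is 16 + 3 * 16 = 64. That window
+ * covers 16 data pages versus 8 before, i.e. exactly inc_len more data
+ * pages, with the byte window stretched to skip the stride gaps. */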
+
+static void ras_increase_window(struct inode *inode,
+ struct ll_readahead_state *ras,
+ struct ll_ra_info *ra)
+{
+ /* The stretch of the ra-window should be aligned with the max RPC
+ * size, but the current clio architecture does not support
+ * retrieving such information from the lower layer. FIXME later
+ */
+ if (stride_io_mode(ras))
+ ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
+ else
+ ras->ras_window_len = min(ras->ras_window_len +
+ RAS_INCREASE_STEP(inode),
+ ra->ra_max_pages_per_file);
+}
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+ struct ll_readahead_state *ras, unsigned long index,
+ unsigned hit)
+{
+ struct ll_ra_info *ra = &sbi->ll_ra_info;
+ int zero = 0, stride_detect = 0, ra_miss = 0;
+
+ spin_lock(&ras->ras_lock);
+
+ ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+ /* Reset the read-ahead window in two cases. First, when the app
+ * seeks or reads to some other part of the file. Second, if we
+ * get a read-ahead miss on a page we think we previously issued.
+ * The latter can be a symptom of there being so many read-ahead
+ * pages that the VM reclaims them before we get to them. */
+ if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
+ zero = 1;
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
+ } else if (!hit && ras->ras_window_len &&
+ index < ras->ras_next_readahead &&
+ index_in_window(index, ras->ras_window_start, 0,
+ ras->ras_window_len)) {
+ ra_miss = 1;
+ ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+ }
+
+ /* On the second access to a file smaller than the tunable
+ * ra_max_read_ahead_whole_pages, trigger RA on all pages in the
+ * file, up to ra_max_pages_per_file. This is simply a best effort
+ * and only occurs once per open file. Normal RA behavior resumes
+ * for subsequent IO. The mmap case does not increment
+ * ras_requests and thus can never trigger this behavior. */
+ if (ras->ras_requests == 2 && !ras->ras_request_index) {
+ __u64 kms_pages;
+
+ kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
+ ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
+
+ if (kms_pages &&
+ kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+ ras->ras_window_start = 0;
+ ras->ras_last_readpage = 0;
+ ras->ras_next_readahead = 0;
+ ras->ras_window_len = min(ra->ra_max_pages_per_file,
+ ra->ra_max_read_ahead_whole_pages);
+ goto out_unlock;
+ }
+ }
+ if (zero) {
+ /* check whether it is in stride I/O mode */
+ if (!index_in_stride_window(ras, index)) {
+ if (ras->ras_consecutive_stride_requests == 0 &&
+ ras->ras_request_index == 0) {
+ ras_update_stride_detector(ras, index);
+ ras->ras_consecutive_stride_requests++;
+ } else {
+ ras_stride_reset(ras);
+ }
+ ras_reset(inode, ras, index);
+ ras->ras_consecutive_pages++;
+ goto out_unlock;
+ } else {
+ ras->ras_consecutive_pages = 0;
+ ras->ras_consecutive_requests = 0;
+ if (++ras->ras_consecutive_stride_requests > 1)
+ stride_detect = 1;
+ RAS_CDEBUG(ras);
+ }
+ } else {
+ if (ra_miss) {
+ if (index_in_stride_window(ras, index) &&
+ stride_io_mode(ras)) {
+ /* If stride read-ahead hits a cache miss, the stride
+ * detector is not reset, to avoid the overhead of
+ * re-detecting the read-ahead mode */
+ if (index != ras->ras_last_readpage + 1)
+ ras->ras_consecutive_pages = 0;
+ ras_reset(inode, ras, index);
+ RAS_CDEBUG(ras);
+ } else {
+ /* Reset both stride window and normal RA
+ * window */
+ ras_reset(inode, ras, index);
+ ras->ras_consecutive_pages++;
+ ras_stride_reset(ras);
+ goto out_unlock;
+ }
+ } else if (stride_io_mode(ras)) {
+ /* If this is a contiguous read while in stride I/O
+ * mode, check whether the stride step is still valid;
+ * if not, reset the stride read-ahead window */
+ if (!index_in_stride_window(ras, index)) {
+ /* Shrink stride read-ahead window to be zero */
+ ras_stride_reset(ras);
+ ras->ras_window_len = 0;
+ ras->ras_next_readahead = index;
+ }
+ }
+ }
+ ras->ras_consecutive_pages++;
+ ras->ras_last_readpage = index;
+ ras_set_start(inode, ras, index);
+
+ if (stride_io_mode(ras))
+ /* Since stride read-ahead is sensitive to the read-ahead
+ * offset, use the original offset here instead of
+ * ras_window_start, which is RPC aligned */
+ ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+ else
+ ras->ras_next_readahead = max(ras->ras_window_start,
+ ras->ras_next_readahead);
+ RAS_CDEBUG(ras);
+
+ /* Trigger RA in the mmap case where ras_consecutive_requests
+ * is not incremented and thus can't be used to trigger RA */
+ if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
+ ras->ras_window_len = RAS_INCREASE_STEP(inode);
+ goto out_unlock;
+ }
+
+ /* Initially reset the stride window offset to next_readahead */
+ if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
+ /**
+ * Once stride IO mode is detected, next_readahead should be
+ * reset to make sure next_readahead > stride offset
+ */
+ ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+ ras->ras_stride_offset = index;
+ ras->ras_window_len = RAS_INCREASE_STEP(inode);
+ }
+
+ /* The initial ras_window_len is set to the request size. To avoid
+ * uselessly reading and discarding pages for random IO the window is
+ * only increased once per consecutive request received. */
+ if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
+ !ras->ras_request_index)
+ ras_increase_window(inode, ras, ra);
+out_unlock:
+ RAS_CDEBUG(ras);
+ ras->ras_request_index++;
+ spin_unlock(&ras->ras_lock);
+ return;
+}
+
+int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
+{
+ struct inode *inode = vmpage->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_page *page;
+ struct cl_object *clob;
+ struct cl_env_nest nest;
+ bool redirtied = false;
+ bool unlocked = false;
+ int result;
+
+ LASSERT(PageLocked(vmpage));
+ LASSERT(!PageWriteback(vmpage));
+
+ LASSERT(ll_i2dtexp(inode) != NULL);
+
+ env = cl_env_nested_get(&nest);
+ if (IS_ERR(env)) {
+ result = PTR_ERR(env);
+ goto out;
+ }
+
+ clob = ll_i2info(inode)->lli_clob;
+ LASSERT(clob != NULL);
+
+ io = ccc_env_thread_io(env);
+ io->ci_obj = clob;
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, clob);
+ if (result == 0) {
+ page = cl_page_find(env, clob, vmpage->index,
+ vmpage, CPT_CACHEABLE);
+ if (!IS_ERR(page)) {
+ lu_ref_add(&page->cp_reference, "writepage",
+ current);
+ cl_page_assume(env, io, page);
+ result = cl_page_flush(env, io, page);
+ if (result != 0) {
+ /*
+ * Re-dirty page on error so it retries write,
+ * but not in case when IO has actually
+ * occurred and completed with an error.
+ */
+ if (!PageError(vmpage)) {
+ redirty_page_for_writepage(wbc, vmpage);
+ result = 0;
+ redirtied = true;
+ }
+ }
+ cl_page_disown(env, io, page);
+ unlocked = true;
+ lu_ref_del(&page->cp_reference,
+ "writepage", current);
+ cl_page_put(env, page);
+ } else {
+ result = PTR_ERR(page);
+ }
+ }
+ cl_io_fini(env, io);
+
+ if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
+ loff_t offset = cl_offset(clob, vmpage->index);
+
+ /* Flushing the page failed because the extent is being written
+ * out. Wait for the extent write to finish to avoid breaking the
+ * kernel, which assumes ->writepage should mark PageWriteback or
+ * clean the page. */
+ result = cl_sync_file_range(inode, offset,
+ offset + PAGE_CACHE_SIZE - 1,
+ CL_FSYNC_LOCAL, 1);
+ if (result > 0) {
+ /* We may actually have written more than one page;
+ * subtract the extra pages here because the caller
+ * will count this page itself. */
+ wbc->nr_to_write -= result - 1;
+ result = 0;
+ }
+ }
+
+ cl_env_nested_put(&nest, env);
+ goto out;
+
+out:
+ if (result < 0) {
+ if (!lli->lli_async_rc)
+ lli->lli_async_rc = result;
+ SetPageError(vmpage);
+ if (!unlocked)
+ unlock_page(vmpage);
+ }
+ return result;
+}
+
+int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ loff_t start;
+ loff_t end;
+ enum cl_fsync_mode mode;
+ int range_whole = 0;
+ int result;
+ int ignore_layout = 0;
+
+ if (wbc->range_cyclic) {
+ start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+ end = OBD_OBJECT_EOF;
+ } else {
+ start = wbc->range_start;
+ end = wbc->range_end;
+ if (end == LLONG_MAX) {
+ end = OBD_OBJECT_EOF;
+ range_whole = start == 0;
+ }
+ }
+
+ mode = CL_FSYNC_NONE;
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ mode = CL_FSYNC_LOCAL;
+
+ if (sbi->ll_umounting)
+ /* if the mountpoint is being umounted, all pages have to be
+ * evicted to avoid hitting LBUG when truncate_inode_pages()
+ * is called later on. */
+ ignore_layout = 1;
+ result = cl_sync_file_range(inode, start, end, mode, ignore_layout);
+ if (result > 0) {
+ wbc->nr_to_write -= result;
+ result = 0;
+ }
+
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
+ if (end == OBD_OBJECT_EOF)
+ end = i_size_read(inode);
+ mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1;
+ }
+ return result;
+}
+
+int ll_readpage(struct file *file, struct page *vmpage)
+{
+ struct ll_cl_context *lcc;
+ int result;
+
+ lcc = ll_cl_init(file, vmpage, 0);
+ if (!IS_ERR(lcc)) {
+ struct lu_env *env = lcc->lcc_env;
+ struct cl_io *io = lcc->lcc_io;
+ struct cl_page *page = lcc->lcc_page;
+
+ LASSERT(page->cp_type == CPT_CACHEABLE);
+ if (likely(!PageUptodate(vmpage))) {
+ cl_page_assume(env, io, page);
+ result = cl_io_read_page(env, io, page);
+ } else {
+ /* Page from a non-object file. */
+ unlock_page(vmpage);
+ result = 0;
+ }
+ ll_cl_fini(lcc);
+ } else {
+ unlock_page(vmpage);
+ result = PTR_ERR(lcc);
+ }
+ return result;
+}
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
new file mode 100644
index 000000000..c6c824356
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -0,0 +1,553 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/rw26.c
+ *
+ * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uaccess.h>
+
+#include <linux/migrate.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+#include "../include/linux/lustre_compat25.h"
+
+/**
+ * Implements Linux VM address_space::invalidatepage() method. This method is
+ * called when the page is truncated from a file, either as a result of
+ * an explicit truncate, or when the inode is removed from memory (as a result
+ * of final iput(), umount, or memory pressure induced icache shrinking).
+ *
+ * [0, offset] bytes of the page remain valid (this is for the case of a
+ * non-page-aligned truncate). Lustre leaves the partially truncated page in
+ * the cache, relying on struct inode::i_size to limit further accesses.
+ */
+static void ll_invalidatepage(struct page *vmpage, unsigned int offset,
+ unsigned int length)
+{
+ struct inode *inode;
+ struct lu_env *env;
+ struct cl_page *page;
+ struct cl_object *obj;
+
+ int refcheck;
+
+ LASSERT(PageLocked(vmpage));
+ LASSERT(!PageWriteback(vmpage));
+
+ /*
+ * It is safe to not check anything in invalidatepage/releasepage
+ * below because they are run with the page locked and all our IO
+ * happens with the page locked too.
+ */
+ if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ inode = vmpage->mapping->host;
+ obj = ll_i2info(inode)->lli_clob;
+ if (obj != NULL) {
+ page = cl_vmpage_page(vmpage, obj);
+ if (page != NULL) {
+ lu_ref_add(&page->cp_reference,
+ "delete", vmpage);
+ cl_page_delete(env, page);
+ lu_ref_del(&page->cp_reference,
+ "delete", vmpage);
+ cl_page_put(env, page);
+ }
+ } else
+ LASSERT(vmpage->private == 0);
+ cl_env_put(env, &refcheck);
+ }
+ }
+}
+
+#ifdef HAVE_RELEASEPAGE_WITH_INT
+#define RELEASEPAGE_ARG_TYPE int
+#else
+#define RELEASEPAGE_ARG_TYPE gfp_t
+#endif
+static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask)
+{
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct cl_object *obj;
+ struct cl_page *page;
+ struct address_space *mapping;
+ int result;
+
+ LASSERT(PageLocked(vmpage));
+ if (PageWriteback(vmpage) || PageDirty(vmpage))
+ return 0;
+
+ mapping = vmpage->mapping;
+ if (mapping == NULL)
+ return 1;
+
+ obj = ll_i2info(mapping->host)->lli_clob;
+ if (obj == NULL)
+ return 1;
+
+ /* 1 for page allocator, 1 for cl_page and 1 for page cache */
+ if (page_count(vmpage) > 3)
+ return 0;
+
+ /* TODO: determine what gfp should be used by @gfp_mask. */
+ env = cl_env_nested_get(&nest);
+ if (IS_ERR(env))
+ /* If we can't allocate an env we won't call cl_page_put()
+ * later on, which means the page refcount held by cl_page
+ * cannot be dropped, so ask the kernel not to free this
+ * page. */
+ return 0;
+
+ page = cl_vmpage_page(vmpage, obj);
+ result = page == NULL;
+ if (page != NULL) {
+ if (!cl_page_in_use(page)) {
+ result = 1;
+ cl_page_delete(env, page);
+ }
+ cl_page_put(env, page);
+ }
+ cl_env_nested_put(&nest, env);
+ return result;
+}
+
+static int ll_set_page_dirty(struct page *vmpage)
+{
+#if 0
+ struct cl_page *page = vvp_vmpage_page_transient(vmpage);
+ struct vvp_object *obj = cl_inode2vvp(vmpage->mapping->host);
+ struct vvp_page *cpg;
+
+ /*
+ * XXX should page method be called here?
+ */
+ LASSERT(&obj->co_cl == page->cp_obj);
+ cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
+ /*
+ * XXX cannot do much here, because page is possibly not locked:
+ * sys_munmap()->...
+ * ->unmap_page_range()->zap_pte_range()->set_page_dirty().
+ */
+ vvp_write_pending(obj, cpg);
+#endif
+ return __set_page_dirty_nobuffers(vmpage);
+}
+
+#define MAX_DIRECTIO_SIZE (2*1024*1024*1024UL)
+
+static inline int ll_get_user_pages(int rw, unsigned long user_addr,
+ size_t size, struct page ***pages,
+ int *max_pages)
+{
+ int result = -ENOMEM;
+
+ /* set an arbitrary limit to prevent arithmetic overflow */
+ if (size > MAX_DIRECTIO_SIZE) {
+ *pages = NULL;
+ return -EFBIG;
+ }
+
+ *max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ *max_pages -= user_addr >> PAGE_CACHE_SHIFT;
+
+ OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages));
+ if (*pages) {
+ result = get_user_pages_fast(user_addr, *max_pages,
+ (rw == READ), *pages);
+ if (unlikely(result <= 0))
+ OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages));
+ }
+
+ return result;
+}
+
+/* ll_free_user_pages - tear down page struct array
+ * @pages: array of page struct pointers underlying target buffer */
+static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
+{
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ if (do_dirty)
+ set_page_dirty_lock(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ kvfree(pages);
+}
+
+ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+ int rw, struct inode *inode,
+ struct ll_dio_pages *pv)
+{
+ struct cl_page *clp;
+ struct cl_2queue *queue;
+ struct cl_object *obj = io->ci_obj;
+ int i;
+ ssize_t rc = 0;
+ loff_t file_offset = pv->ldp_start_offset;
+ long size = pv->ldp_size;
+ int page_count = pv->ldp_nr;
+ struct page **pages = pv->ldp_pages;
+ long page_size = cl_page_size(obj);
+ bool do_io;
+ int io_pages = 0;
+
+ queue = &io->ci_queue;
+ cl_2queue_init(queue);
+ for (i = 0; i < page_count; i++) {
+ if (pv->ldp_offsets)
+ file_offset = pv->ldp_offsets[i];
+
+ LASSERT(!(file_offset & (page_size - 1)));
+ clp = cl_page_find(env, obj, cl_index(obj, file_offset),
+ pv->ldp_pages[i], CPT_TRANSIENT);
+ if (IS_ERR(clp)) {
+ rc = PTR_ERR(clp);
+ break;
+ }
+
+ rc = cl_page_own(env, io, clp);
+ if (rc) {
+ LASSERT(clp->cp_state == CPS_FREEING);
+ cl_page_put(env, clp);
+ break;
+ }
+
+ do_io = true;
+
+ /* check the page type: if the page is a host (cached) page,
+ * then copy the data directly */
+ if (clp->cp_type == CPT_CACHEABLE) {
+ struct page *vmpage = cl_page_vmpage(env, clp);
+ struct page *src_page;
+ struct page *dst_page;
+ void *src;
+ void *dst;
+
+ src_page = (rw == WRITE) ? pages[i] : vmpage;
+ dst_page = (rw == WRITE) ? vmpage : pages[i];
+
+ src = kmap_atomic(src_page);
+ dst = kmap_atomic(dst_page);
+ memcpy(dst, src, min(page_size, size));
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+
+ /* make sure page will be added to the transfer by
+ * cl_io_submit()->...->vvp_page_prep_write(). */
+ if (rw == WRITE)
+ set_page_dirty(vmpage);
+
+ if (rw == READ) {
+ /* Do not issue the page for read, since it may
+ * re-read a read-ahead page that does not have the
+ * uptodate bit set. */
+ cl_page_disown(env, io, clp);
+ do_io = false;
+ }
+ }
+
+ if (likely(do_io)) {
+ cl_2queue_add(queue, clp);
+
+ /*
+ * Set page clip to tell transfer formation engine
+ * that page has to be sent even if it is beyond KMS.
+ */
+ cl_page_clip(env, clp, 0, min(size, page_size));
+
+ ++io_pages;
+ }
+
+ /* drop the reference count for cl_page_find */
+ cl_page_put(env, clp);
+ size -= page_size;
+ file_offset += page_size;
+ }
+
+ if (rc == 0 && io_pages) {
+ rc = cl_io_submit_sync(env, io,
+ rw == READ ? CRT_READ : CRT_WRITE,
+ queue, 0);
+ }
+ if (rc == 0)
+ rc = pv->ldp_size;
+
+ cl_2queue_discard(env, io, queue);
+ cl_2queue_disown(env, io, queue);
+ cl_2queue_fini(env, queue);
+ return rc;
+}
+EXPORT_SYMBOL(ll_direct_rw_pages);
+
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+ int rw, struct inode *inode,
+ struct address_space *mapping,
+ size_t size, loff_t file_offset,
+ struct page **pages, int page_count)
+{
+ struct ll_dio_pages pvec = { .ldp_pages = pages,
+ .ldp_nr = page_count,
+ .ldp_size = size,
+ .ldp_offsets = NULL,
+ .ldp_start_offset = file_offset
+ };
+
+ return ll_direct_rw_pages(env, io, rw, inode, &pvec);
+}
+
+#ifdef KMALLOC_MAX_SIZE
+#define MAX_MALLOC KMALLOC_MAX_SIZE
+#else
+#define MAX_MALLOC (128 * 1024)
+#endif
+
+/* This is the maximum size of a single O_DIRECT request, based on the
+ * kmalloc limit. We need to fit all of the brw_page structs, each one
+ * representing PAGE_SIZE worth of user data, into a single buffer, and
+ * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is
+ * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */
+#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
+ ~(DT_MAX_BRW_SIZE - 1))
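+/* Rough arithmetic behind the 22MB figure above (illustrative, assuming a
+ * 128kB kmalloc limit, 4kB pages and sizeof(struct brw_page) of about 24
+ * bytes): 131072 / 24 = 5461 brw_page slots, 5461 * 4096 is roughly 22MB,
+ * which is then rounded down to a DT_MAX_BRW_SIZE multiple. */
+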
+static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t file_offset)
+{
+ struct lu_env *env;
+ struct cl_io *io;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ccc_object *obj = cl_inode2ccc(inode);
+ ssize_t count = iov_iter_count(iter);
+ ssize_t tot_bytes = 0, result = 0;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ long size = MAX_DIO_SIZE;
+ int refcheck;
+
+ if (!lli->lli_has_smd)
+ return -EBADF;
+
+ /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
+ if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
+ return -EINVAL;
+
+ CDEBUG(D_VFSTRACE,
+ "VFS Op:inode=%lu/%u(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n",
+ inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
+ file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
+ MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
+
+ /* Check that all user buffers are aligned as well */
+ if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK)
+ return -EINVAL;
+
+ env = cl_env_get(&refcheck);
+ LASSERT(!IS_ERR(env));
+ io = ccc_env_io(env)->cui_cl.cis_io;
+ LASSERT(io != NULL);
+
+ /* 0. Need locking between buffered and direct access, and to handle
+ * races with size changes from concurrent truncates and writes.
+ * 1. Need the inode mutex to operate on transient pages.
+ */
+ if (iov_iter_rw(iter) == READ)
+ mutex_lock(&inode->i_mutex);
+
+ LASSERT(obj->cob_transient_pages == 0);
+ while (iov_iter_count(iter)) {
+ struct page **pages;
+ size_t offs;
+
+ count = min_t(size_t, iov_iter_count(iter), size);
+ if (iov_iter_rw(iter) == READ) {
+ if (file_offset >= i_size_read(inode))
+ break;
+ if (file_offset + count > i_size_read(inode))
+ count = i_size_read(inode) - file_offset;
+ }
+
+ result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
+ if (likely(result > 0)) {
+ int n = DIV_ROUND_UP(result + offs, PAGE_SIZE);
+ result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter),
+ inode, file->f_mapping,
+ result, file_offset, pages,
+ n);
+ ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ);
+ }
+ if (unlikely(result <= 0)) {
+ /* If we can't allocate a large enough buffer
+ * for the request, shrink it to a smaller
+ * PAGE_SIZE multiple and try again.
+ * We should always be able to kmalloc for a
+ * page worth of page pointers = 4MB on i386. */
+ if (result == -ENOMEM &&
+ size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+ PAGE_CACHE_SIZE) {
+ size = ((((size / 2) - 1) |
+ ~CFS_PAGE_MASK) + 1) &
+ CFS_PAGE_MASK;
+ CDEBUG(D_VFSTRACE, "DIO size now %lu\n",
+ size);
+ continue;
+ }
+
+ goto out;
+ }
+ iov_iter_advance(iter, result);
+ tot_bytes += result;
+ file_offset += result;
+ }
+out:
+ LASSERT(obj->cob_transient_pages == 0);
+ if (iov_iter_rw(iter) == READ)
+ mutex_unlock(&inode->i_mutex);
+
+ if (tot_bytes > 0) {
+ if (iov_iter_rw(iter) == WRITE) {
+ struct lov_stripe_md *lsm;
+
+ lsm = ccc_inode_lsm_get(inode);
+ LASSERT(lsm != NULL);
+ lov_stripe_lock(lsm);
+ obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0);
+ lov_stripe_unlock(lsm);
+ ccc_inode_lsm_put(inode, lsm);
+ }
+ }
+
+ cl_env_put(env, &refcheck);
+ return tot_bytes ? : result;
+}
+
+static int ll_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ int rc;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+
+ *pagep = page;
+
+ rc = ll_prepare_write(file, page, from, from + len);
+ if (rc) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return rc;
+}
+
+static int ll_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ int rc;
+
+ rc = ll_commit_write(file, page, from, from + copied);
+ unlock_page(page);
+ page_cache_release(page);
+
+ return rc ?: copied;
+}
+
+#ifdef CONFIG_MIGRATION
+static int ll_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ /* Always fail page migration until we have a proper implementation */
+ return -EIO;
+}
+#endif
+
+#ifndef MS_HAS_NEW_AOPS
+const struct address_space_operations ll_aops = {
+ .readpage = ll_readpage,
+ .direct_IO = ll_direct_IO_26,
+ .writepage = ll_writepage,
+ .writepages = ll_writepages,
+ .set_page_dirty = ll_set_page_dirty,
+ .write_begin = ll_write_begin,
+ .write_end = ll_write_end,
+ .invalidatepage = ll_invalidatepage,
+ .releasepage = (void *)ll_releasepage,
+#ifdef CONFIG_MIGRATION
+ .migratepage = ll_migratepage,
+#endif
+};
+#else
+const struct address_space_operations_ext ll_aops = {
+ .orig_aops.readpage = ll_readpage,
+/* .orig_aops.readpages = ll_readpages, */
+ .orig_aops.direct_IO = ll_direct_IO_26,
+ .orig_aops.writepage = ll_writepage,
+ .orig_aops.writepages = ll_writepages,
+ .orig_aops.set_page_dirty = ll_set_page_dirty,
+ .orig_aops.prepare_write = ll_prepare_write,
+ .orig_aops.commit_write = ll_commit_write,
+ .orig_aops.invalidatepage = ll_invalidatepage,
+ .orig_aops.releasepage = ll_releasepage,
+#ifdef CONFIG_MIGRATION
+ .orig_aops.migratepage = ll_migratepage,
+#endif
+ .write_begin = ll_write_begin,
+ .write_end = ll_write_end
+};
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c
new file mode 100644
index 000000000..7f8071242
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/statahead.c
@@ -0,0 +1,1729 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/obd_support.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_dlm.h"
+#include "llite_internal.h"
+
+#define SA_OMITTED_ENTRY_MAX 8ULL
+
+typedef enum {
+ /** negative values are for error cases */
+ SA_ENTRY_INIT = 0, /** init entry */
+ SA_ENTRY_SUCC = 1, /** stat succeed */
+ SA_ENTRY_INVA = 2, /** invalid entry */
+ SA_ENTRY_DEST = 3, /** entry to be destroyed */
+} se_stat_t;
+
+struct ll_sa_entry {
+ /* link into sai->sai_entries */
+ struct list_head se_link;
+ /* link into sai->sai_entries_{received,stated} */
+ struct list_head se_list;
+ /* link into sai hash table locally */
+ struct list_head se_hash;
+ /* entry reference count */
+ atomic_t se_refcount;
+ /* entry index in the sai */
+ __u64 se_index;
+ /* low layer ldlm lock handle */
+ __u64 se_handle;
+ /* entry status */
+ se_stat_t se_stat;
+ /* entry size, contains name */
+ int se_size;
+ /* pointer to async getattr enqueue info */
+ struct md_enqueue_info *se_minfo;
+ /* pointer to the async getattr request */
+ struct ptlrpc_request *se_req;
+ /* pointer to the target inode */
+ struct inode *se_inode;
+ /* entry name */
+ struct qstr se_qstr;
+};
+
+static unsigned int sai_generation;
+static DEFINE_SPINLOCK(sai_generation_lock);
+
+static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
+{
+ return list_empty(&entry->se_hash);
+}
+
+/*
+ * The entry can only be released by the caller; it is necessary to hold the lock.
+ */
+static inline int ll_sa_entry_stated(struct ll_sa_entry *entry)
+{
+ smp_rmb();
+ return (entry->se_stat != SA_ENTRY_INIT);
+}
+
+static inline int ll_sa_entry_hash(int val)
+{
+ return val & LL_SA_CACHE_MASK;
+}
+
+/*
+ * Insert the entry into the statahead hash table.
+ */
+static inline void
+ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+ int i = ll_sa_entry_hash(entry->se_qstr.hash);
+
+ spin_lock(&sai->sai_cache_lock[i]);
+ list_add_tail(&entry->se_hash, &sai->sai_cache[i]);
+ spin_unlock(&sai->sai_cache_lock[i]);
+}
+
+/*
+ * Remove entry from SA table.
+ */
+static inline void
+ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+ int i = ll_sa_entry_hash(entry->se_qstr.hash);
+
+ spin_lock(&sai->sai_cache_lock[i]);
+ list_del_init(&entry->se_hash);
+ spin_unlock(&sai->sai_cache_lock[i]);
+}
+
+static inline int agl_should_run(struct ll_statahead_info *sai,
+ struct inode *inode)
+{
+ return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid);
+}
+
+static inline struct ll_sa_entry *
+sa_first_received_entry(struct ll_statahead_info *sai)
+{
+ return list_entry(sai->sai_entries_received.next,
+ struct ll_sa_entry, se_list);
+}
+
+static inline struct ll_inode_info *
+agl_first_entry(struct ll_statahead_info *sai)
+{
+ return list_entry(sai->sai_entries_agl.next,
+ struct ll_inode_info, lli_agl_list);
+}
+
+static inline int sa_sent_full(struct ll_statahead_info *sai)
+{
+ return atomic_read(&sai->sai_cache_count) >= sai->sai_max;
+}
+
+static inline int sa_received_empty(struct ll_statahead_info *sai)
+{
+ return list_empty(&sai->sai_entries_received);
+}
+
+static inline int agl_list_empty(struct ll_statahead_info *sai)
+{
+ return list_empty(&sai->sai_entries_agl);
+}
+
+/**
+ * Low hit means either:
+ * (1) the hit ratio is less than 80%, or
+ * (2) there are more than 8 consecutive misses.
+ */
+static inline int sa_low_hit(struct ll_statahead_info *sai)
+{
+ return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) ||
+ (sai->sai_consecutive_miss > 8));
+}
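+
+/* For sa_low_hit() above (hypothetical counters): sai_hit = 16,
+ * sai_miss = 5 is a low hit rate (16 < 4 * 5, i.e. roughly a 76% hit
+ * ratio), while sai_hit = 20, sai_miss = 5 (an 80% ratio) is not, unless
+ * more than 8 consecutive misses have accumulated. */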
+
+/*
+ * If the given index is behind the statahead window by more than
+ * SA_OMITTED_ENTRY_MAX, then it is old.
+ */
+static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index)
+{
+ return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX <
+ sai->sai_index);
+}
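+
+/* For is_omitted_entry() above (hypothetical values): with sai_max = 32 and
+ * sai_index = 100, an entry at index 50 gives 32 + 50 + 8 = 90 < 100 and
+ * is treated as old, whereas index 64 gives 104 >= 100 and is kept. */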
+
+/*
+ * Insert the entry at the tail of sai_entries when it is initialized.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
+ const char *name, int len)
+{
+ struct ll_inode_info *lli;
+ struct ll_sa_entry *entry;
+ int entry_size;
+ char *dname;
+
+ entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
+ entry = kzalloc(entry_size, GFP_NOFS);
+ if (unlikely(!entry))
+ return ERR_PTR(-ENOMEM);
+
+ CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n",
+ len, name, entry, index);
+
+ entry->se_index = index;
+
+ /*
+ * Statahead entry reference rules:
+ *
+ * 1) When a statahead entry is initialized, its reference count is set to 2.
+ * One reference is used by the directory scanner. When the scanner
+ * searches the statahead cache for the given name, it can perform a
+ * lockless hash lookup (only the scanner can remove an entry from the
+ * hash list), and once the entry is found, it need not call
+ * "atomic_inc()" on the entry reference, so performance is improved.
+ * After using the statahead entry, the scanner calls "atomic_dec()" to
+ * drop the reference held since initialization. If it is the last
+ * reference, the statahead entry is freed.
+ *
+ * 2) All other threads, including the statahead thread and the ptlrpcd
+ * thread, must hold a reference to the entry while they process it, to
+ * guarantee that the entry will not be released by the directory
+ * scanner. After processing the entry, these threads drop the entry
+ * reference. If it is the last reference, the entry is freed.
+ *
+ * The second reference taken when the statahead entry is initialized is
+ * used by the statahead thread, following rule 2).
+ */
+ atomic_set(&entry->se_refcount, 2);
+ entry->se_stat = SA_ENTRY_INIT;
+ entry->se_size = entry_size;
+ dname = (char *)entry + sizeof(struct ll_sa_entry);
+ memcpy(dname, name, len);
+ dname[len] = 0;
+ entry->se_qstr.hash = full_name_hash(name, len);
+ entry->se_qstr.len = len;
+ entry->se_qstr.name = dname;
+
+ lli = ll_i2info(sai->sai_inode);
+ spin_lock(&lli->lli_sa_lock);
+ list_add_tail(&entry->se_link, &sai->sai_entries);
+ INIT_LIST_HEAD(&entry->se_list);
+ ll_sa_entry_enhash(sai, entry);
+ spin_unlock(&lli->lli_sa_lock);
+
+ atomic_inc(&sai->sai_cache_count);
+
+ return entry;
+}
+
+/*
+ * Used by the directory scanner to look up an entry by name.
+ *
+ * Only the caller can remove the entry from the hash, so it is unnecessary to
+ * hold the hash lock. It is the caller's duty to release the init refcount on
+ * the entry, so it is also unnecessary to increase the refcount here.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr)
+{
+ struct ll_sa_entry *entry;
+ int i = ll_sa_entry_hash(qstr->hash);
+
+ list_for_each_entry(entry, &sai->sai_cache[i], se_hash) {
+ if (entry->se_qstr.hash == qstr->hash &&
+ entry->se_qstr.len == qstr->len &&
+ memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Used by the async getattr request callback to find an entry by index.
+ *
+ * Called with lli_sa_lock held to prevent others from changing the list
+ * during the search. The entry refcount is increased before returning to
+ * guarantee that the entry cannot be freed by others.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index)
+{
+ struct ll_sa_entry *entry;
+
+ list_for_each_entry(entry, &sai->sai_entries, se_link) {
+ if (entry->se_index == index) {
+ LASSERT(atomic_read(&entry->se_refcount) > 0);
+ atomic_inc(&entry->se_refcount);
+ return entry;
+ }
+ if (entry->se_index > index)
+ break;
+ }
+ return NULL;
+}
+
+static void ll_sa_entry_cleanup(struct ll_statahead_info *sai,
+ struct ll_sa_entry *entry)
+{
+ struct md_enqueue_info *minfo = entry->se_minfo;
+ struct ptlrpc_request *req = entry->se_req;
+
+ if (minfo) {
+ entry->se_minfo = NULL;
+ ll_intent_release(&minfo->mi_it);
+ iput(minfo->mi_dir);
+ OBD_FREE_PTR(minfo);
+ }
+
+ if (req) {
+ entry->se_req = NULL;
+ ptlrpc_req_finished(req);
+ }
+}
+
+static void ll_sa_entry_put(struct ll_statahead_info *sai,
+ struct ll_sa_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->se_refcount)) {
+ CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n",
+ entry->se_qstr.len, entry->se_qstr.name, entry,
+ entry->se_index);
+
+ LASSERT(list_empty(&entry->se_link));
+ LASSERT(list_empty(&entry->se_list));
+ LASSERT(ll_sa_entry_unhashed(entry));
+
+ ll_sa_entry_cleanup(sai, entry);
+ iput(entry->se_inode);
+
+ OBD_FREE(entry, entry->se_size);
+ atomic_dec(&sai->sai_cache_count);
+ }
+}
+
+static inline void
+do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+ struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+ LASSERT(!ll_sa_entry_unhashed(entry));
+ LASSERT(!list_empty(&entry->se_link));
+
+ ll_sa_entry_unhash(sai, entry);
+
+ spin_lock(&lli->lli_sa_lock);
+ entry->se_stat = SA_ENTRY_DEST;
+ list_del_init(&entry->se_link);
+ if (likely(!list_empty(&entry->se_list)))
+ list_del_init(&entry->se_list);
+ spin_unlock(&lli->lli_sa_lock);
+
+ ll_sa_entry_put(sai, entry);
+}
+
+/*
+ * Delete the entry from the sai_entries_stated list on fini, and drop any
+ * entries that have fallen out of the statahead window.
+ */
+static void
+ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+ struct ll_sa_entry *pos, *next;
+
+ if (entry)
+ do_sa_entry_fini(sai, entry);
+
+ /* Drop old entries; only the 'scanner' process does this, so no lock is needed. */
+ list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) {
+ if (!is_omitted_entry(sai, pos->se_index))
+ break;
+ do_sa_entry_fini(sai, pos);
+ }
+}
+
+/*
+ * Called with lli_sa_lock held. Insert the entry into sai_entries_stated,
+ * keeping the list sorted by se_index.
+ */
+static void
+do_sa_entry_to_stated(struct ll_statahead_info *sai,
+ struct ll_sa_entry *entry, se_stat_t stat)
+{
+ struct ll_sa_entry *se;
+ struct list_head *pos = &sai->sai_entries_stated;
+
+ if (!list_empty(&entry->se_list))
+ list_del_init(&entry->se_list);
+
+ list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
+ if (se->se_index < entry->se_index) {
+ pos = &se->se_list;
+ break;
+ }
+ }
+
+ list_add(&entry->se_list, pos);
+ entry->se_stat = stat;
+}
+
+/*
+ * Move the entry to sai_entries_stated, keeping the list sorted by index.
+ * \retval 1 -- entry is to be destroyed.
+ * \retval 0 -- entry is inserted into the stated list.
+ */
+static int
+ll_sa_entry_to_stated(struct ll_statahead_info *sai,
+ struct ll_sa_entry *entry, se_stat_t stat)
+{
+ struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+ int ret = 1;
+
+ ll_sa_entry_cleanup(sai, entry);
+
+ spin_lock(&lli->lli_sa_lock);
+ if (likely(entry->se_stat != SA_ENTRY_DEST)) {
+ do_sa_entry_to_stated(sai, entry, stat);
+ ret = 0;
+ }
+ spin_unlock(&lli->lli_sa_lock);
+
+ return ret;
+}
+
+/*
+ * Insert inode into the list of sai_entries_agl.
+ */
+static void ll_agl_add(struct ll_statahead_info *sai,
+ struct inode *inode, int index)
+{
+ struct ll_inode_info *child = ll_i2info(inode);
+ struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
+ int added = 0;
+
+ spin_lock(&child->lli_agl_lock);
+ if (child->lli_agl_index == 0) {
+ child->lli_agl_index = index;
+ spin_unlock(&child->lli_agl_lock);
+
+ LASSERT(list_empty(&child->lli_agl_list));
+
+ igrab(inode);
+ spin_lock(&parent->lli_agl_lock);
+ if (agl_list_empty(sai))
+ added = 1;
+ list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
+ spin_unlock(&parent->lli_agl_lock);
+ } else {
+ spin_unlock(&child->lli_agl_lock);
+ }
+
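+ /* Only wake up the AGL thread when the list transitioned from empty
+ * to non-empty; otherwise a wakeup is presumably already pending. */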
+ if (added > 0)
+ wake_up(&sai->sai_agl_thread.t_ctl_waitq);
+}
+
+static struct ll_statahead_info *ll_sai_alloc(void)
+{
+ struct ll_statahead_info *sai;
+ int i;
+
+ sai = kzalloc(sizeof(*sai), GFP_NOFS);
+ if (!sai)
+ return NULL;
+
+ atomic_set(&sai->sai_refcount, 1);
+
+ spin_lock(&sai_generation_lock);
+ sai->sai_generation = ++sai_generation;
+ if (unlikely(sai_generation == 0))
+ sai->sai_generation = ++sai_generation;
+ spin_unlock(&sai_generation_lock);
+
+ sai->sai_max = LL_SA_RPC_MIN;
+ sai->sai_index = 1;
+ init_waitqueue_head(&sai->sai_waitq);
+ init_waitqueue_head(&sai->sai_thread.t_ctl_waitq);
+ init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq);
+
+ INIT_LIST_HEAD(&sai->sai_entries);
+ INIT_LIST_HEAD(&sai->sai_entries_received);
+ INIT_LIST_HEAD(&sai->sai_entries_stated);
+ INIT_LIST_HEAD(&sai->sai_entries_agl);
+
+ for (i = 0; i < LL_SA_CACHE_SIZE; i++) {
+ INIT_LIST_HEAD(&sai->sai_cache[i]);
+ spin_lock_init(&sai->sai_cache_lock[i]);
+ }
+ atomic_set(&sai->sai_cache_count, 0);
+
+ return sai;
+}
+
+static inline struct ll_statahead_info *
+ll_sai_get(struct ll_statahead_info *sai)
+{
+ atomic_inc(&sai->sai_refcount);
+ return sai;
+}
+
+static void ll_sai_put(struct ll_statahead_info *sai)
+{
+ struct inode *inode = sai->sai_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+
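+ /*
+ * atomic_dec_and_lock() takes lli_sa_lock only when the refcount
+ * drops to zero, so the teardown below runs under the lock and is
+ * serialized against other users of lli_sai.
+ */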
+ if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
+ struct ll_sa_entry *entry, *next;
+
+ if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
+ /* This is a race case: the interpret callback has just
+ * taken a reference count. */
+ spin_unlock(&lli->lli_sa_lock);
+ return;
+ }
+
+ LASSERT(lli->lli_opendir_key == NULL);
+ LASSERT(thread_is_stopped(&sai->sai_thread));
+ LASSERT(thread_is_stopped(&sai->sai_agl_thread));
+
+ lli->lli_sai = NULL;
+ lli->lli_opendir_pid = 0;
+ spin_unlock(&lli->lli_sa_lock);
+
+ if (sai->sai_sent > sai->sai_replied)
+ CDEBUG(D_READA, "statahead for dir "DFID
+ " does not finish: [sent:%llu] [replied:%llu]\n",
+ PFID(&lli->lli_fid),
+ sai->sai_sent, sai->sai_replied);
+
+ list_for_each_entry_safe(entry, next,
+ &sai->sai_entries, se_link)
+ do_sa_entry_fini(sai, entry);
+
+ LASSERT(list_empty(&sai->sai_entries));
+ LASSERT(sa_received_empty(sai));
+ LASSERT(list_empty(&sai->sai_entries_stated));
+
+ LASSERT(atomic_read(&sai->sai_cache_count) == 0);
+ LASSERT(agl_list_empty(sai));
+
+ iput(inode);
+ OBD_FREE_PTR(sai);
+ }
+}
+
+/* Do NOT forget to drop the inode refcount taken when the inode was put into sai_entries_agl. */
+static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ __u64 index = lli->lli_agl_index;
+ int rc;
+
+ LASSERT(list_empty(&lli->lli_agl_list));
+
+ /* AGL may fall behind statahead by one entry */
+ if (is_omitted_entry(sai, index + 1)) {
+ lli->lli_agl_index = 0;
+ iput(inode);
+ return;
+ }
+
+ /* Someone is in glimpse (sync or async), do nothing. */
+ rc = down_write_trylock(&lli->lli_glimpse_sem);
+ if (rc == 0) {
+ lli->lli_agl_index = 0;
+ iput(inode);
+ return;
+ }
+
+ /*
+ * Someone triggered a glimpse within the last second.
+ * 1) The former glimpse succeeded with a glimpse lock granted by the
+ * OST. If the lock is still cached on the client, AGL needs to do
+ * nothing. If it has been cancelled by another client, AGL may not be
+ * able to obtain a new lock either, since no glimpse callback is
+ * triggered by AGL.
+ * 2) The former glimpse succeeded, but the OST did not grant a glimpse
+ * lock. In that case, it is quite possible that the OST will not grant
+ * a glimpse lock for AGL either.
+ * 3) The former glimpse failed. Compared with the other two cases this
+ * is relatively rare; AGL can ignore it without much effect on
+ * performance.
+ */
+ if (lli->lli_glimpse_time != 0 &&
+ time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
+ up_write(&lli->lli_glimpse_sem);
+ lli->lli_agl_index = 0;
+ iput(inode);
+ return;
+ }
+
+ CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
+ DFID", idx = %llu\n", PFID(&lli->lli_fid), index);
+
+ cl_agl(inode);
+ lli->lli_agl_index = 0;
+ lli->lli_glimpse_time = cfs_time_current();
+ up_write(&lli->lli_glimpse_sem);
+
+ CDEBUG(D_READA, "Handled (init) async glimpse: inode= "
+ DFID", idx = %llu, rc = %d\n",
+ PFID(&lli->lli_fid), index, rc);
+
+ iput(inode);
+}
+
+static void ll_post_statahead(struct ll_statahead_info *sai)
+{
+ struct inode *dir = sai->sai_inode;
+ struct inode *child;
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct ll_sa_entry *entry;
+ struct md_enqueue_info *minfo;
+ struct lookup_intent *it;
+ struct ptlrpc_request *req;
+ struct mdt_body *body;
+ int rc = 0;
+
+ spin_lock(&lli->lli_sa_lock);
+ if (unlikely(sa_received_empty(sai))) {
+ spin_unlock(&lli->lli_sa_lock);
+ return;
+ }
+ entry = sa_first_received_entry(sai);
+ atomic_inc(&entry->se_refcount);
+ list_del_init(&entry->se_list);
+ spin_unlock(&lli->lli_sa_lock);
+
+ LASSERT(entry->se_handle != 0);
+
+ minfo = entry->se_minfo;
+ it = &minfo->mi_it;
+ req = entry->se_req;
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ child = entry->se_inode;
+ if (child == NULL) {
+ /*
+ * lookup.
+ */
+ LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));
+
+ /* XXX: No fid in reply, this is probably cross-ref case.
+ * SA can't handle it yet. */
+ if (body->valid & OBD_MD_MDS) {
+ rc = -EAGAIN;
+ goto out;
+ }
+ } else {
+ /*
+ * revalidate.
+ */
+ /* unlinked and re-created with the same name */
+ if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))) {
+ entry->se_inode = NULL;
+ iput(child);
+ child = NULL;
+ }
+ }
+
+ it->d.lustre.it_lock_handle = entry->se_handle;
+ rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
+ if (rc != 1) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ rc = ll_prep_inode(&child, req, dir->i_sb, it);
+ if (rc)
+ goto out;
+
+ CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+ child, child->i_ino, child->i_generation);
+ ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);
+
+ entry->se_inode = child;
+
+ if (agl_should_run(sai, child))
+ ll_agl_add(sai, child, entry->se_index);
+
+out:
+ /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock
+ * reference count by calling "ll_intent_drop_lock()" in spite of the
+ * above operations failed or not. Do not worry about calling
+ * "ll_intent_drop_lock()" more than once. */
+ rc = ll_sa_entry_to_stated(sai, entry,
+ rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+ if (rc == 0 && entry->se_index == sai->sai_index_wait)
+ wake_up(&sai->sai_waitq);
+ ll_sa_entry_put(sai, entry);
+}
+
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+ struct md_enqueue_info *minfo, int rc)
+{
+ struct lookup_intent *it = &minfo->mi_it;
+ struct inode *dir = minfo->mi_dir;
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct ll_statahead_info *sai = NULL;
+ struct ll_sa_entry *entry;
+ __u64 handle = 0;
+ int wakeup;
+
+ if (it_disposition(it, DISP_LOOKUP_NEG))
+ rc = -ENOENT;
+
+ if (rc == 0) {
+ /* release the ibits lock ASAP to avoid deadlock when the
+ * statahead thread enqueues a lock on the parent in readdir and
+ * another process enqueues a lock on the child with the parent
+ * lock held, e.g. unlink. */
+ handle = it->d.lustre.it_lock_handle;
+ ll_intent_drop_lock(it);
+ }
+
+ spin_lock(&lli->lli_sa_lock);
+ /* stale entry */
+ if (unlikely(lli->lli_sai == NULL ||
+ lli->lli_sai->sai_generation != minfo->mi_generation)) {
+ spin_unlock(&lli->lli_sa_lock);
+ rc = -ESTALE;
+ goto out;
+ } else {
+ sai = ll_sai_get(lli->lli_sai);
+ if (unlikely(!thread_is_running(&sai->sai_thread))) {
+ sai->sai_replied++;
+ spin_unlock(&lli->lli_sa_lock);
+ rc = -EBADFD;
+ goto out;
+ }
+
+ entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
+ if (entry == NULL) {
+ sai->sai_replied++;
+ spin_unlock(&lli->lli_sa_lock);
+ rc = -EIDRM;
+ goto out;
+ }
+
+ if (rc != 0) {
+ do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
+ wakeup = (entry->se_index == sai->sai_index_wait);
+ } else {
+ entry->se_minfo = minfo;
+ entry->se_req = ptlrpc_request_addref(req);
+ /* Release the async ibits lock ASAP to avoid deadlock
+ * when the statahead thread tries to enqueue a lock on the
+ * parent for readpage and another thread tries to enqueue a
+ * lock on the child with the parent's lock held, e.g. unlink. */
+ entry->se_handle = handle;
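+ /* Only wake up the statahead thread if the received list
+ * was empty, i.e. it may be sleeping waiting for replies. */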
+ wakeup = sa_received_empty(sai);
+ list_add_tail(&entry->se_list,
+ &sai->sai_entries_received);
+ }
+ sai->sai_replied++;
+ spin_unlock(&lli->lli_sa_lock);
+
+ ll_sa_entry_put(sai, entry);
+ if (wakeup)
+ wake_up(&sai->sai_thread.t_ctl_waitq);
+ }
+
+out:
+ if (rc != 0) {
+ ll_intent_release(it);
+ iput(dir);
+ OBD_FREE_PTR(minfo);
+ }
+ if (sai != NULL)
+ ll_sai_put(sai);
+ return rc;
+}
+
+static void sa_args_fini(struct md_enqueue_info *minfo,
+ struct ldlm_enqueue_info *einfo)
+{
+ LASSERT(minfo && einfo);
+ iput(minfo->mi_dir);
+ capa_put(minfo->mi_data.op_capa1);
+ capa_put(minfo->mi_data.op_capa2);
+ OBD_FREE_PTR(minfo);
+ OBD_FREE_PTR(einfo);
+}
+
+/**
+ * There is a race condition between "capa_put" and "ll_statahead_interpret"
+ * when accessing "op_data.op_capa[1,2]": "capa_put" releases the reference
+ * count of "op_data.op_capa[1,2]" after "md_intent_getattr_async" is called.
+ * But "ll_statahead_interpret" may run first and fill "op_data.op_capa[1,2]"
+ * with POISON, causing "capa_put" to access an invalid "ocapa". So the
+ * capabilities are saved in "pcapa" here before "md_intent_getattr_async"
+ * is called.
+ */
+static int sa_args_init(struct inode *dir, struct inode *child,
+ struct ll_sa_entry *entry, struct md_enqueue_info **pmi,
+ struct ldlm_enqueue_info **pei,
+ struct obd_capa **pcapa)
+{
+ struct qstr *qstr = &entry->se_qstr;
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct md_enqueue_info *minfo;
+ struct ldlm_enqueue_info *einfo;
+ struct md_op_data *op_data;
+
+ einfo = kzalloc(sizeof(*einfo), GFP_NOFS);
+ if (!einfo)
+ return -ENOMEM;
+
+ minfo = kzalloc(sizeof(*minfo), GFP_NOFS);
+ if (!minfo) {
+ OBD_FREE_PTR(einfo);
+ return -ENOMEM;
+ }
+
+ op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name,
+ qstr->len, 0, LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data)) {
+ OBD_FREE_PTR(einfo);
+ OBD_FREE_PTR(minfo);
+ return PTR_ERR(op_data);
+ }
+
+ minfo->mi_it.it_op = IT_GETATTR;
+ minfo->mi_dir = igrab(dir);
+ minfo->mi_cb = ll_statahead_interpret;
+ minfo->mi_generation = lli->lli_sai->sai_generation;
+ minfo->mi_cbdata = entry->se_index;
+
+ einfo->ei_type = LDLM_IBITS;
+ einfo->ei_mode = it_to_lock_mode(&minfo->mi_it);
+ einfo->ei_cb_bl = ll_md_blocking_ast;
+ einfo->ei_cb_cp = ldlm_completion_ast;
+ einfo->ei_cb_gl = NULL;
+ einfo->ei_cbdata = NULL;
+
+ *pmi = minfo;
+ *pei = einfo;
+ pcapa[0] = op_data->op_capa1;
+ pcapa[1] = op_data->op_capa2;
+
+ return 0;
+}
+
+static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry)
+{
+ struct md_enqueue_info *minfo;
+ struct ldlm_enqueue_info *einfo;
+ struct obd_capa *capas[2];
+ int rc;
+
+ rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas);
+ if (rc)
+ return rc;
+
+ rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+ if (!rc) {
+ capa_put(capas[0]);
+ capa_put(capas[1]);
+ } else {
+ sa_args_fini(minfo, einfo);
+ }
+
+ return rc;
+}
+
+/**
+ * similar to ll_revalidate_it().
+ * \retval 1 -- dentry valid
+ * \retval 0 -- will send stat-ahead request
+ * \retval others -- prepare stat-ahead request failed
+ */
+static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
+ struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct lookup_intent it = { .it_op = IT_GETATTR,
+ .d.lustre.it_lock_handle = 0 };
+ struct md_enqueue_info *minfo;
+ struct ldlm_enqueue_info *einfo;
+ struct obd_capa *capas[2];
+ int rc;
+
+ if (unlikely(inode == NULL))
+ return 1;
+
+ if (d_mountpoint(dentry))
+ return 1;
+
+ entry->se_inode = igrab(inode);
+ rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),
+ NULL);
+ if (rc == 1) {
+ entry->se_handle = it.d.lustre.it_lock_handle;
+ ll_intent_release(&it);
+ return 1;
+ }
+
+ rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
+ if (rc) {
+ entry->se_inode = NULL;
+ iput(inode);
+ return rc;
+ }
+
+ rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+ if (!rc) {
+ capa_put(capas[0]);
+ capa_put(capas[1]);
+ } else {
+ entry->se_inode = NULL;
+ iput(inode);
+ sa_args_fini(minfo, einfo);
+ }
+
+ return rc;
+}
+
+static void ll_statahead_one(struct dentry *parent, const char *entry_name,
+ int entry_name_len)
+{
+ struct inode *dir = d_inode(parent);
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct ll_statahead_info *sai = lli->lli_sai;
+ struct dentry *dentry = NULL;
+ struct ll_sa_entry *entry;
+ int rc;
+ int rc1;
+
+ entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name,
+ entry_name_len);
+ if (IS_ERR(entry))
+ return;
+
+ dentry = d_lookup(parent, &entry->se_qstr);
+ if (!dentry) {
+ rc = do_sa_lookup(dir, entry);
+ } else {
+ rc = do_sa_revalidate(dir, entry, dentry);
+ if (rc == 1 && agl_should_run(sai, d_inode(dentry)))
+ ll_agl_add(sai, d_inode(dentry), entry->se_index);
+ }
+
+ if (dentry != NULL)
+ dput(dentry);
+
+ if (rc) {
+ rc1 = ll_sa_entry_to_stated(sai, entry,
+ rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+ if (rc1 == 0 && entry->se_index == sai->sai_index_wait)
+ wake_up(&sai->sai_waitq);
+ } else {
+ sai->sai_sent++;
+ }
+
+ sai->sai_index++;
+ /* drop one refcount taken in ll_sa_entry_alloc() */
+ ll_sa_entry_put(sai, entry);
+}
+
+static int ll_agl_thread(void *arg)
+{
+ struct dentry *parent = (struct dentry *)arg;
+ struct inode *dir = d_inode(parent);
+ struct ll_inode_info *plli = ll_i2info(dir);
+ struct ll_inode_info *clli;
+ struct ll_sb_info *sbi = ll_i2sbi(dir);
+ struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai);
+ struct ptlrpc_thread *thread = &sai->sai_agl_thread;
+ struct l_wait_info lwi = { 0 };
+
+ thread->t_pid = current_pid();
+ CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n",
+ sai, parent);
+
+ atomic_inc(&sbi->ll_agl_total);
+ spin_lock(&plli->lli_agl_lock);
+ sai->sai_agl_valid = 1;
+ if (thread_is_init(thread))
+ /* If someone else has changed the thread state
+ * (e.g. already changed to SVC_STOPPING), we can't just
+ * blindly overwrite that setting. */
+ thread_set_flags(thread, SVC_RUNNING);
+ spin_unlock(&plli->lli_agl_lock);
+ wake_up(&thread->t_ctl_waitq);
+
+ while (1) {
+ l_wait_event(thread->t_ctl_waitq,
+ !agl_list_empty(sai) ||
+ !thread_is_running(thread),
+ &lwi);
+
+ if (!thread_is_running(thread))
+ break;
+
+ spin_lock(&plli->lli_agl_lock);
+ /* The statahead thread may help process AGL entries,
+ * so check whether the list is empty again. */
+ if (!agl_list_empty(sai)) {
+ clli = agl_first_entry(sai);
+ list_del_init(&clli->lli_agl_list);
+ spin_unlock(&plli->lli_agl_lock);
+ ll_agl_trigger(&clli->lli_vfs_inode, sai);
+ } else {
+ spin_unlock(&plli->lli_agl_lock);
+ }
+ }
+
+ spin_lock(&plli->lli_agl_lock);
+ sai->sai_agl_valid = 0;
+ while (!agl_list_empty(sai)) {
+ clli = agl_first_entry(sai);
+ list_del_init(&clli->lli_agl_list);
+ spin_unlock(&plli->lli_agl_lock);
+ clli->lli_agl_index = 0;
+ iput(&clli->lli_vfs_inode);
+ spin_lock(&plli->lli_agl_lock);
+ }
+ thread_set_flags(thread, SVC_STOPPED);
+ spin_unlock(&plli->lli_agl_lock);
+ wake_up(&thread->t_ctl_waitq);
+ ll_sai_put(sai);
+ CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n",
+ sai, parent);
+ return 0;
+}
+
+static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai)
+{
+ struct ptlrpc_thread *thread = &sai->sai_agl_thread;
+ struct l_wait_info lwi = { 0 };
+ struct ll_inode_info *plli;
+ struct task_struct *task;
+
+ CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n",
+ sai, parent);
+
+ plli = ll_i2info(d_inode(parent));
+ task = kthread_run(ll_agl_thread, parent,
+ "ll_agl_%u", plli->lli_opendir_pid);
+ if (IS_ERR(task)) {
+ CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task));
+ thread_set_flags(thread, SVC_STOPPED);
+ return;
+ }
+
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_running(thread) || thread_is_stopped(thread),
+ &lwi);
+}
+
+static int ll_statahead_thread(void *arg)
+{
+ struct dentry *parent = (struct dentry *)arg;
+ struct inode *dir = d_inode(parent);
+ struct ll_inode_info *plli = ll_i2info(dir);
+ struct ll_inode_info *clli;
+ struct ll_sb_info *sbi = ll_i2sbi(dir);
+ struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai);
+ struct ptlrpc_thread *thread = &sai->sai_thread;
+ struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread;
+ struct page *page;
+ __u64 pos = 0;
+ int first = 0;
+ int rc = 0;
+ struct ll_dir_chain chain;
+ struct l_wait_info lwi = { 0 };
+
+ thread->t_pid = current_pid();
+ CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n",
+ sai, parent);
+
+ if (sbi->ll_flags & LL_SBI_AGL_ENABLED)
+ ll_start_agl(parent, sai);
+
+ atomic_inc(&sbi->ll_sa_total);
+ spin_lock(&plli->lli_sa_lock);
+ if (thread_is_init(thread))
+ /* If someone else has changed the thread state
+ * (e.g. already changed to SVC_STOPPING), we can't just
+ * blindly overwrite that setting. */
+ thread_set_flags(thread, SVC_RUNNING);
+ spin_unlock(&plli->lli_sa_lock);
+ wake_up(&thread->t_ctl_waitq);
+
+ ll_dir_chain_init(&chain);
+ page = ll_get_dir_page(dir, pos, &chain);
+
+ while (1) {
+ struct lu_dirpage *dp;
+ struct lu_dirent *ent;
+
+ if (IS_ERR(page)) {
+ rc = PTR_ERR(page);
+ CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: [rc %d] [parent %u]\n",
+ PFID(ll_inode2fid(dir)), pos, sai->sai_index,
+ rc, plli->lli_opendir_pid);
+ goto out;
+ }
+
+ dp = page_address(page);
+ for (ent = lu_dirent_start(dp); ent != NULL;
+ ent = lu_dirent_next(ent)) {
+ __u64 hash;
+ int namelen;
+ char *name;
+
+ hash = le64_to_cpu(ent->lde_hash);
+ if (unlikely(hash < pos))
+ /*
+ * Skip until we find target hash value.
+ */
+ continue;
+
+ namelen = le16_to_cpu(ent->lde_namelen);
+ if (unlikely(namelen == 0))
+ /*
+ * Skip dummy record.
+ */
+ continue;
+
+ name = ent->lde_name;
+ if (name[0] == '.') {
+ if (namelen == 1) {
+ /*
+ * skip "."
+ */
+ continue;
+ } else if (name[1] == '.' && namelen == 2) {
+ /*
+ * skip ".."
+ */
+ continue;
+ } else if (!sai->sai_ls_all) {
+ /*
+ * skip hidden files.
+ */
+ sai->sai_skip_hidden++;
+ continue;
+ }
+ }
+
+ /*
+ * don't stat-ahead first entry.
+ */
+ if (unlikely(++first == 1))
+ continue;
+
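+ /*
+ * Statahead pipeline for one directory entry: wait for room in
+ * the RPC window, drain any received replies, help trigger AGL
+ * glimpses while the window is full, then send the next
+ * statahead request via ll_statahead_one() below.
+ */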
+keep_it:
+ l_wait_event(thread->t_ctl_waitq,
+ !sa_sent_full(sai) ||
+ !sa_received_empty(sai) ||
+ !agl_list_empty(sai) ||
+ !thread_is_running(thread),
+ &lwi);
+
+interpret_it:
+ while (!sa_received_empty(sai))
+ ll_post_statahead(sai);
+
+ if (unlikely(!thread_is_running(thread))) {
+ ll_release_page(page, 0);
+ rc = 0;
+ goto out;
+ }
+
+ /* If there is no window for metadata statahead, but there
+ * are some AGL entries to be triggered, then try to help
+ * process the AGL entries. */
+ if (sa_sent_full(sai)) {
+ spin_lock(&plli->lli_agl_lock);
+ while (!agl_list_empty(sai)) {
+ clli = agl_first_entry(sai);
+ list_del_init(&clli->lli_agl_list);
+ spin_unlock(&plli->lli_agl_lock);
+ ll_agl_trigger(&clli->lli_vfs_inode,
+ sai);
+
+ if (!sa_received_empty(sai))
+ goto interpret_it;
+
+ if (unlikely(
+ !thread_is_running(thread))) {
+ ll_release_page(page, 0);
+ rc = 0;
+ goto out;
+ }
+
+ if (!sa_sent_full(sai))
+ goto do_it;
+
+ spin_lock(&plli->lli_agl_lock);
+ }
+ spin_unlock(&plli->lli_agl_lock);
+
+ goto keep_it;
+ }
+
+do_it:
+ ll_statahead_one(parent, name, namelen);
+ }
+ pos = le64_to_cpu(dp->ldp_hash_end);
+ if (pos == MDS_DIR_END_OFF) {
+ /*
+ * End of directory reached.
+ */
+ ll_release_page(page, 0);
+ while (1) {
+ l_wait_event(thread->t_ctl_waitq,
+ !sa_received_empty(sai) ||
+ sai->sai_sent == sai->sai_replied ||
+ !thread_is_running(thread),
+ &lwi);
+
+ while (!sa_received_empty(sai))
+ ll_post_statahead(sai);
+
+ if (unlikely(!thread_is_running(thread))) {
+ rc = 0;
+ goto out;
+ }
+
+ if (sai->sai_sent == sai->sai_replied &&
+ sa_received_empty(sai))
+ break;
+ }
+
+ spin_lock(&plli->lli_agl_lock);
+ while (!agl_list_empty(sai) &&
+ thread_is_running(thread)) {
+ clli = agl_first_entry(sai);
+ list_del_init(&clli->lli_agl_list);
+ spin_unlock(&plli->lli_agl_lock);
+ ll_agl_trigger(&clli->lli_vfs_inode, sai);
+ spin_lock(&plli->lli_agl_lock);
+ }
+ spin_unlock(&plli->lli_agl_lock);
+
+ rc = 0;
+ goto out;
+ } else if (1) {
+ /*
+ * chain is exhausted.
+ * Normal case: continue to the next page.
+ */
+ ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+ LDF_COLLIDE);
+ page = ll_get_dir_page(dir, pos, &chain);
+ } else {
+ LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+ ll_release_page(page, 1);
+ /*
+ * go into overflow page.
+ */
+ }
+ }
+
+out:
+ if (sai->sai_agl_valid) {
+ spin_lock(&plli->lli_agl_lock);
+ thread_set_flags(agl_thread, SVC_STOPPING);
+ spin_unlock(&plli->lli_agl_lock);
+ wake_up(&agl_thread->t_ctl_waitq);
+
+ CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n",
+ sai, (unsigned int)agl_thread->t_pid);
+ l_wait_event(agl_thread->t_ctl_waitq,
+ thread_is_stopped(agl_thread),
+ &lwi);
+ } else {
+ /* Set agl_thread flags anyway. */
+ thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+ }
+ ll_dir_chain_fini(&chain);
+ spin_lock(&plli->lli_sa_lock);
+ if (!sa_received_empty(sai)) {
+ thread_set_flags(thread, SVC_STOPPING);
+ spin_unlock(&plli->lli_sa_lock);
+
+ /* To release the resources held by received entries. */
+ while (!sa_received_empty(sai))
+ ll_post_statahead(sai);
+
+ spin_lock(&plli->lli_sa_lock);
+ }
+ thread_set_flags(thread, SVC_STOPPED);
+ spin_unlock(&plli->lli_sa_lock);
+ wake_up(&sai->sai_waitq);
+ wake_up(&thread->t_ctl_waitq);
+ ll_sai_put(sai);
+ dput(parent);
+ CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n",
+ sai, parent);
+ return rc;
+}
+
+/**
+ * called in ll_file_release().
+ */
+void ll_stop_statahead(struct inode *dir, void *key)
+{
+ struct ll_inode_info *lli = ll_i2info(dir);
+
+ if (unlikely(key == NULL))
+ return;
+
+ spin_lock(&lli->lli_sa_lock);
+ if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
+ spin_unlock(&lli->lli_sa_lock);
+ return;
+ }
+
+ lli->lli_opendir_key = NULL;
+
+ if (lli->lli_sai) {
+ struct l_wait_info lwi = { 0 };
+ struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;
+
+ if (!thread_is_stopped(thread)) {
+ thread_set_flags(thread, SVC_STOPPING);
+ spin_unlock(&lli->lli_sa_lock);
+ wake_up(&thread->t_ctl_waitq);
+
+ CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n",
+ lli->lli_sai, (unsigned int)thread->t_pid);
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_stopped(thread),
+ &lwi);
+ } else {
+ spin_unlock(&lli->lli_sa_lock);
+ }
+
+ /*
+ * Put the ref that was taken on the first statahead_enter.
+ * It may not be the last ref, since some statahead requests
+ * may still be in flight.
+ */
+ ll_sai_put(lli->lli_sai);
+ } else {
+ lli->lli_opendir_pid = 0;
+ spin_unlock(&lli->lli_sa_lock);
+ }
+}
+
+enum {
+ /**
+ * not first dirent, or is "."
+ */
+ LS_NONE_FIRST_DE = 0,
+ /**
+ * the first non-hidden dirent
+ */
+ LS_FIRST_DE,
+ /**
+ * the first hidden dirent, i.e. a dot file
+ */
+ LS_FIRST_DOT_DE
+};
+
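+/*
+ * Check whether the dentry is the first dirent returned for the directory
+ * (ignoring "." and "..", and ignoring hidden entries when the target itself
+ * is not hidden). Returns LS_FIRST_DE for the first non-hidden dirent,
+ * LS_FIRST_DOT_DE for a first hidden dirent, and LS_NONE_FIRST_DE otherwise;
+ * do_statahead_enter() uses the result to decide whether to start statahead
+ * and whether hidden entries should be stat-ahead as well.
+ */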
+static int is_first_dirent(struct inode *dir, struct dentry *dentry)
+{
+ struct ll_dir_chain chain;
+ struct qstr *target = &dentry->d_name;
+ struct page *page;
+ __u64 pos = 0;
+ int dot_de;
+ int rc = LS_NONE_FIRST_DE;
+
+ ll_dir_chain_init(&chain);
+ page = ll_get_dir_page(dir, pos, &chain);
+
+ while (1) {
+ struct lu_dirpage *dp;
+ struct lu_dirent *ent;
+
+ if (IS_ERR(page)) {
+ struct ll_inode_info *lli = ll_i2info(dir);
+
+ rc = PTR_ERR(page);
+ CERROR("error reading dir "DFID" at %llu: [rc %d] [parent %u]\n",
+ PFID(ll_inode2fid(dir)), pos,
+ rc, lli->lli_opendir_pid);
+ break;
+ }
+
+ dp = page_address(page);
+ for (ent = lu_dirent_start(dp); ent != NULL;
+ ent = lu_dirent_next(ent)) {
+ __u64 hash;
+ int namelen;
+ char *name;
+
+ hash = le64_to_cpu(ent->lde_hash);
+ /* ll_get_dir_page() can return any page containing
+ * the given hash, which may not be the start hash. */
+ if (unlikely(hash < pos))
+ continue;
+
+ namelen = le16_to_cpu(ent->lde_namelen);
+ if (unlikely(namelen == 0))
+ /*
+ * skip dummy record.
+ */
+ continue;
+
+ name = ent->lde_name;
+ if (name[0] == '.') {
+ if (namelen == 1)
+ /*
+ * skip "."
+ */
+ continue;
+ else if (name[1] == '.' && namelen == 2)
+ /*
+ * skip ".."
+ */
+ continue;
+ else
+ dot_de = 1;
+ } else {
+ dot_de = 0;
+ }
+
+ if (dot_de && target->name[0] != '.') {
+ CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
+ target->len, target->name,
+ namelen, name);
+ continue;
+ }
+
+ if (target->len != namelen ||
+ memcmp(target->name, name, namelen) != 0)
+ rc = LS_NONE_FIRST_DE;
+ else if (!dot_de)
+ rc = LS_FIRST_DE;
+ else
+ rc = LS_FIRST_DOT_DE;
+
+ ll_release_page(page, 0);
+ goto out;
+ }
+ pos = le64_to_cpu(dp->ldp_hash_end);
+ if (pos == MDS_DIR_END_OFF) {
+ /*
+ * End of directory reached.
+ */
+ ll_release_page(page, 0);
+ break;
+ } else if (1) {
+ /*
+ * chain is exhausted
+ * Normal case: continue to the next page.
+ */
+ ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+ LDF_COLLIDE);
+ page = ll_get_dir_page(dir, pos, &chain);
+ } else {
+ /*
+ * go into overflow page.
+ */
+ LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+ ll_release_page(page, 1);
+ }
+ }
+
+out:
+ ll_dir_chain_fini(&chain);
+ return rc;
+}
+
+static void
+ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+ struct ptlrpc_thread *thread = &sai->sai_thread;
+ struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode);
+ int hit;
+
+ if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
+ hit = 1;
+ else
+ hit = 0;
+
+ ll_sa_entry_fini(sai, entry);
+ if (hit) {
+ sai->sai_hit++;
+ sai->sai_consecutive_miss = 0;
+ sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max);
+ } else {
+ struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+ sai->sai_miss++;
+ sai->sai_consecutive_miss++;
+ if (sa_low_hit(sai) && thread_is_running(thread)) {
+ atomic_inc(&sbi->ll_sa_wrong);
+ CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread\n",
+ PFID(&lli->lli_fid), sai->sai_hit,
+ sai->sai_miss, sai->sai_sent,
+ sai->sai_replied);
+ spin_lock(&lli->lli_sa_lock);
+ if (!thread_is_stopped(thread))
+ thread_set_flags(thread, SVC_STOPPING);
+ spin_unlock(&lli->lli_sa_lock);
+ }
+ }
+
+ if (!thread_is_stopped(thread))
+ wake_up(&thread->t_ctl_waitq);
+}
+
+/**
+ * Start the statahead thread if this is the first dir entry.
+ * Otherwise, if a thread has already been started, wait until it is ahead
+ * of us.
+ * \retval 1 -- found the entry with its lock in the cache, the caller needs
+ * to do nothing.
+ * \retval 0 -- found the entry in the cache, but without a lock, the caller
+ * needs to refresh it from the MDS.
+ * \retval others -- the caller needs to proceed as non-statahead.
+ */
+int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
+ int only_unplug)
+{
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct ll_statahead_info *sai = lli->lli_sai;
+ struct dentry *parent;
+ struct ll_sa_entry *entry;
+ struct ptlrpc_thread *thread;
+ struct l_wait_info lwi = { 0 };
+ int rc = 0;
+ struct ll_inode_info *plli;
+
+ LASSERT(lli->lli_opendir_pid == current_pid());
+
+ if (sai) {
+ thread = &sai->sai_thread;
+ if (unlikely(thread_is_stopped(thread) &&
+ list_empty(&sai->sai_entries_stated))) {
+ /* to release resource */
+ ll_stop_statahead(dir, lli->lli_opendir_key);
+ return -EAGAIN;
+ }
+
+ if ((*dentryp)->d_name.name[0] == '.') {
+ if (sai->sai_ls_all ||
+ sai->sai_miss_hidden >= sai->sai_skip_hidden) {
+ /*
+ * The hidden dentry is the first one, or the statahead
+ * thread has not skipped that many hidden dentries
+ * before "sai_ls_all" was enabled, as below.
+ */
+ } else {
+ if (!sai->sai_ls_all)
+ /*
+ * Maybe the hidden dentry is not
+ * the first one and "sai_ls_all"
+ * was not set, so "ls -al" missed
+ * it. Enable "sai_ls_all" for such
+ * a case.
+ */
+ sai->sai_ls_all = 1;
+
+ /*
+ * Such a "getattr" has been skipped
+ * before "sai_ls_all" was enabled as
+ * above.
+ */
+ sai->sai_miss_hidden++;
+ return -EAGAIN;
+ }
+ }
+
+ entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
+ if (entry == NULL || only_unplug) {
+ ll_sai_unplug(sai, entry);
+ return entry ? 1 : -EAGAIN;
+ }
+
+ if (!ll_sa_entry_stated(entry)) {
+ sai->sai_index_wait = entry->se_index;
+ lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+ LWI_ON_SIGNAL_NOOP, NULL);
+ rc = l_wait_event(sai->sai_waitq,
+ ll_sa_entry_stated(entry) ||
+ thread_is_stopped(thread),
+ &lwi);
+ if (rc < 0) {
+ ll_sai_unplug(sai, entry);
+ return -EAGAIN;
+ }
+ }
+
+ if (entry->se_stat == SA_ENTRY_SUCC &&
+ entry->se_inode != NULL) {
+ struct inode *inode = entry->se_inode;
+ struct lookup_intent it = { .it_op = IT_GETATTR,
+ .d.lustre.it_lock_handle =
+ entry->se_handle };
+ __u64 bits;
+
+ rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
+ ll_inode2fid(inode), &bits);
+ if (rc == 1) {
+ if (d_inode(*dentryp) == NULL) {
+ struct dentry *alias;
+
+ alias = ll_splice_alias(inode,
+ *dentryp);
+ if (IS_ERR(alias)) {
+ ll_sai_unplug(sai, entry);
+ return PTR_ERR(alias);
+ }
+ *dentryp = alias;
+ } else if (d_inode(*dentryp) != inode) {
+ /* revalidate, but inode is recreated */
+ CDEBUG(D_READA,
+ "stale dentry %pd inode %lu/%u, statahead inode %lu/%u\n",
+ *dentryp,
+ d_inode(*dentryp)->i_ino,
+ d_inode(*dentryp)->i_generation,
+ inode->i_ino,
+ inode->i_generation);
+ ll_sai_unplug(sai, entry);
+ return -ESTALE;
+ } else {
+ iput(inode);
+ }
+ entry->se_inode = NULL;
+
+ if ((bits & MDS_INODELOCK_LOOKUP) &&
+ d_lustre_invalid(*dentryp))
+ d_lustre_revalidate(*dentryp);
+ ll_intent_release(&it);
+ }
+ }
+
+ ll_sai_unplug(sai, entry);
+ return rc;
+ }
+
+ /* I am the "lli_opendir_pid" owner; only I can set "lli_sai". */
+ rc = is_first_dirent(dir, *dentryp);
+ if (rc == LS_NONE_FIRST_DE) {
+ /* It is not an "ls -{a}l" operation, so there is no need for statahead. */
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ sai = ll_sai_alloc();
+ if (sai == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
+ sai->sai_inode = igrab(dir);
+ if (unlikely(sai->sai_inode == NULL)) {
+ CWARN("Do not start stat ahead on dying inode "DFID"\n",
+ PFID(&lli->lli_fid));
+ rc = -ESTALE;
+ goto out;
+ }
+
+ /* get a reference on the parent here; it is put in ll_statahead_thread() */
+ parent = dget((*dentryp)->d_parent);
+ if (unlikely(sai->sai_inode != d_inode(parent))) {
+ struct ll_inode_info *nlli = ll_i2info(d_inode(parent));
+
+ CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n",
+ *dentryp,
+ PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
+ dput(parent);
+ iput(sai->sai_inode);
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ CDEBUG(D_READA, "start statahead thread: sai %p, parent %pd\n",
+ sai, parent);
+
+ /* The sai buffer already has one reference taken at allocation time,
+ * but as soon as we expose the sai by attaching it to the lli that
+ * default reference can be dropped by another thread calling
+ * ll_stop_statahead. We need to take a local reference to protect
+ * the sai buffer while we intend to access it. */
+ ll_sai_get(sai);
+ lli->lli_sai = sai;
+
+ plli = ll_i2info(d_inode(parent));
+ rc = PTR_ERR(kthread_run(ll_statahead_thread, parent,
+ "ll_sa_%u", plli->lli_opendir_pid));
+ thread = &sai->sai_thread;
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("can't start ll_sa thread, rc: %d\n", rc);
+ dput(parent);
+ lli->lli_opendir_key = NULL;
+ thread_set_flags(thread, SVC_STOPPED);
+ thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+ /* Drop both our own local reference and the default
+ * reference from allocation time. */
+ ll_sai_put(sai);
+ ll_sai_put(sai);
+ LASSERT(lli->lli_sai == NULL);
+ return -EAGAIN;
+ }
+
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_running(thread) || thread_is_stopped(thread),
+ &lwi);
+ ll_sai_put(sai);
+
+ /*
+ * We don't stat-ahead for the first dirent since we are already in
+ * lookup.
+ */
+ return -EAGAIN;
+
+out:
+ if (sai != NULL)
+ OBD_FREE_PTR(sai);
+ spin_lock(&lli->lli_sa_lock);
+ lli->lli_opendir_key = NULL;
+ lli->lli_opendir_pid = 0;
+ spin_unlock(&lli->lli_sa_lock);
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
new file mode 100644
index 000000000..a494f6271
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/super25.c
@@ -0,0 +1,226 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include "../include/lustre_lite.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_dlm.h"
+#include <linux/init.h>
+#include <linux/fs.h>
+#include "../include/lprocfs_status.h"
+#include "llite_internal.h"
+
+static struct kmem_cache *ll_inode_cachep;
+
+static struct inode *ll_alloc_inode(struct super_block *sb)
+{
+ struct ll_inode_info *lli;
+ ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1);
+ OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, GFP_NOFS);
+ if (lli == NULL)
+ return NULL;
+
+ inode_init_once(&lli->lli_vfs_inode);
+ return &lli->lli_vfs_inode;
+}
+
+static void ll_inode_destroy_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct ll_inode_info *ptr = ll_i2info(inode);
+ OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep);
+}
+
+static void ll_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, ll_inode_destroy_callback);
+}
+
+/* exported operations */
+struct super_operations lustre_super_operations = {
+ .alloc_inode = ll_alloc_inode,
+ .destroy_inode = ll_destroy_inode,
+ .evict_inode = ll_delete_inode,
+ .put_super = ll_put_super,
+ .statfs = ll_statfs,
+ .umount_begin = ll_umount_begin,
+ .remount_fs = ll_remount_fs,
+ .show_options = ll_show_options,
+};
+MODULE_ALIAS_FS("lustre");
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
+
+static int __init init_lustre_lite(void)
+{
+ struct proc_dir_entry *entry;
+ lnet_process_id_t lnet_id;
+ struct timeval tv;
+ int i, rc, seed[2];
+
+ CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1);
+
+ /* print an address of _any_ initialized kernel symbol from this
+ * module, to allow debugging with gdb that doesn't support data
+ * symbols from modules.*/
+ CDEBUG(D_INFO, "Lustre client module (%p).\n",
+ &lustre_super_operations);
+
+ rc = -ENOMEM;
+ ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
+ sizeof(struct ll_inode_info),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (ll_inode_cachep == NULL)
+ goto out_cache;
+
+ ll_file_data_slab = kmem_cache_create("ll_file_data",
+ sizeof(struct ll_file_data), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (ll_file_data_slab == NULL)
+ goto out_cache;
+
+ ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache",
+ sizeof(struct ll_remote_perm),
+ 0, 0, NULL);
+ if (ll_remote_perm_cachep == NULL)
+ goto out_cache;
+
+ ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache",
+ REMOTE_PERM_HASHSIZE *
+ sizeof(struct list_head),
+ 0, 0, NULL);
+ if (ll_rmtperm_hash_cachep == NULL)
+ goto out_cache;
+
+ entry = lprocfs_register("llite", proc_lustre_root, NULL, NULL);
+ if (IS_ERR(entry)) {
+ rc = PTR_ERR(entry);
+ CERROR("cannot register '/proc/fs/lustre/llite': rc = %d\n",
+ rc);
+ goto out_cache;
+ }
+
+ proc_lustre_fs_root = entry;
+
+ cfs_get_random_bytes(seed, sizeof(seed));
+
+ /* Nodes with small feet have little entropy. The NID for this
+ * node gives the most entropy in the low bits */
+ for (i = 0;; i++) {
+ if (LNetGetId(i, &lnet_id) == -ENOENT)
+ break;
+
+ if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND)
+ seed[0] ^= LNET_NIDADDR(lnet_id.nid);
+ }
+
+ do_gettimeofday(&tv);
+ cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+ setup_timer(&ll_capa_timer, ll_capa_timer_callback, 0);
+ rc = ll_capa_thread_start();
+ if (rc != 0)
+ goto out_proc;
+
+ rc = vvp_global_init();
+ if (rc != 0)
+ goto out_capa;
+
+ rc = ll_xattr_init();
+ if (rc != 0)
+ goto out_vvp;
+
+ lustre_register_client_fill_super(ll_fill_super);
+ lustre_register_kill_super_cb(ll_kill_super);
+ lustre_register_client_process_config(ll_process_config);
+
+ return 0;
+
+out_vvp:
+ vvp_global_fini();
+out_capa:
+ del_timer(&ll_capa_timer);
+ ll_capa_thread_stop();
+out_proc:
+ lprocfs_remove(&proc_lustre_fs_root);
+out_cache:
+ if (ll_inode_cachep != NULL)
+ kmem_cache_destroy(ll_inode_cachep);
+
+ if (ll_file_data_slab != NULL)
+ kmem_cache_destroy(ll_file_data_slab);
+
+ if (ll_remote_perm_cachep != NULL)
+ kmem_cache_destroy(ll_remote_perm_cachep);
+
+ if (ll_rmtperm_hash_cachep != NULL)
+ kmem_cache_destroy(ll_rmtperm_hash_cachep);
+
+ return rc;
+}
+
+static void __exit exit_lustre_lite(void)
+{
+ lustre_register_client_fill_super(NULL);
+ lustre_register_kill_super_cb(NULL);
+ lustre_register_client_process_config(NULL);
+
+ lprocfs_remove(&proc_lustre_fs_root);
+
+ ll_xattr_fini();
+ vvp_global_fini();
+ del_timer(&ll_capa_timer);
+ ll_capa_thread_stop();
+ LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0,
+ "client remaining capa count %d\n",
+ capa_count[CAPA_SITE_CLIENT]);
+
+ kmem_cache_destroy(ll_inode_cachep);
+ kmem_cache_destroy(ll_rmtperm_hash_cachep);
+
+ kmem_cache_destroy(ll_remote_perm_cachep);
+
+ kmem_cache_destroy(ll_file_data_slab);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Lite Client File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_lustre_lite);
+module_exit(exit_lustre_lite);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
new file mode 100644
index 000000000..3711e671a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+
+static int ll_readlink_internal(struct inode *inode,
+ struct ptlrpc_request **request, char **symname)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ int rc, symlen = i_size_read(inode) + 1;
+ struct mdt_body *body;
+ struct md_op_data *op_data;
+
+ *request = NULL;
+
+ if (lli->lli_symlink_name) {
+ int print_limit = min_t(int, PAGE_SIZE - 128, symlen);
+
+ *symname = lli->lli_symlink_name;
+ /* If the total CDEBUG() size is larger than a page, it
+ * will print a warning to the console; avoid this by
+ * printing just the last part of the symlink. */
+ CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n",
+ print_limit < symlen ? "..." : "", print_limit,
+ (*symname) + symlen - print_limit, symlen);
+ return 0;
+ }
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ return PTR_ERR(op_data);
+
+ op_data->op_valid = OBD_MD_LINKNAME;
+ rc = md_getattr(sbi->ll_md_exp, op_data, request);
+ ll_finish_md_op_data(op_data);
+ if (rc) {
+ if (rc != -ENOENT)
+ CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
+ goto failed;
+ }
+
+ body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+ if ((body->valid & OBD_MD_LINKNAME) == 0) {
+ CERROR("OBD_MD_LINKNAME not set on reply\n");
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ LASSERT(symlen != 0);
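+ /* The reply is expected to carry the whole symlink target including
+ * the terminating NUL, i.e. eadatasize == i_size + 1. */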
+ if (body->eadatasize != symlen) {
+ CERROR("inode %lu: symlink length %d not expected %d\n",
+ inode->i_ino, body->eadatasize - 1, symlen - 1);
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD);
+ if (*symname == NULL ||
+ strnlen(*symname, symlen) != symlen - 1) {
+ /* not full/NULL terminated */
+ CERROR("inode %lu: symlink not NULL terminated string of length %d\n",
+ inode->i_ino, symlen - 1);
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ lli->lli_symlink_name = kzalloc(symlen, GFP_NOFS);
+ /* do not return an error if we cannot cache the symlink locally */
+ if (lli->lli_symlink_name) {
+ memcpy(lli->lli_symlink_name, *symname, symlen);
+ *symname = lli->lli_symlink_name;
+ }
+ return 0;
+
+failed:
+ return rc;
+}
+
+static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ptlrpc_request *request = NULL;
+ int rc;
+ char *symname = NULL;
+
+ CDEBUG(D_VFSTRACE, "VFS Op\n");
+ /* Limit the recursive symlink depth to 5 instead of the default
+ * 8 links when the kernel has a 4k stack, to prevent stack overflow.
+ * For 8k stacks we need to limit it to 7 for local servers. */
+ if (THREAD_SIZE < 8192 && current->link_count >= 6) {
+ rc = -ELOOP;
+ } else if (THREAD_SIZE == 8192 && current->link_count >= 8) {
+ rc = -ELOOP;
+ } else {
+ ll_inode_size_lock(inode);
+ rc = ll_readlink_internal(inode, &request, &symname);
+ ll_inode_size_unlock(inode);
+ }
+ if (rc) {
+ ptlrpc_req_finished(request);
+ request = NULL;
+ symname = ERR_PTR(rc);
+ }
+
+ nd_set_link(nd, symname);
+ /* symname may point into the request message buffer, so we
+ * delay releasing the request until ll_put_link().
+ */
+ return request;
+}
+
+static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+ ptlrpc_req_finished(cookie);
+}
+
+struct inode_operations ll_fast_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .setattr = ll_setattr,
+ .follow_link = ll_follow_link,
+ .put_link = ll_put_link,
+ .getattr = ll_getattr,
+ .permission = ll_inode_permission,
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c
new file mode 100644
index 000000000..fde41d7c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_dev.c
@@ -0,0 +1,547 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_device and cl_device_type implementation for VVP layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include "../include/obd.h"
+#include "../include/lustre_lite.h"
+#include "llite_internal.h"
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * The vvp_ prefix stands for "Vfs Vm Posix". It corresponds to the
+ * historical "llite_" (variant "ll_") prefix.
+ */
+
+static struct kmem_cache *vvp_thread_kmem;
+static struct kmem_cache *vvp_session_kmem;
+static struct lu_kmem_descr vvp_caches[] = {
+ {
+ .ckd_cache = &vvp_thread_kmem,
+ .ckd_name = "vvp_thread_kmem",
+ .ckd_size = sizeof(struct vvp_thread_info),
+ },
+ {
+ .ckd_cache = &vvp_session_kmem,
+ .ckd_name = "vvp_session_kmem",
+ .ckd_size = sizeof(struct vvp_session)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+static void *vvp_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct vvp_thread_info *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, GFP_NOFS);
+ if (info == NULL)
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+static void vvp_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct vvp_thread_info *info = data;
+
+ OBD_SLAB_FREE_PTR(info, vvp_thread_kmem);
+}
+
+static void *vvp_session_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct vvp_session *session;
+
+ OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, GFP_NOFS);
+ if (session == NULL)
+ session = ERR_PTR(-ENOMEM);
+ return session;
+}
+
+static void vvp_session_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct vvp_session *session = data;
+
+ OBD_SLAB_FREE_PTR(session, vvp_session_kmem);
+}
+
+
+struct lu_context_key vvp_key = {
+ .lct_tags = LCT_CL_THREAD,
+ .lct_init = vvp_key_init,
+ .lct_fini = vvp_key_fini
+};
+
+struct lu_context_key vvp_session_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = vvp_session_key_init,
+ .lct_fini = vvp_session_key_fini
+};
+
+/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key);
+
+static const struct lu_device_operations vvp_lu_ops = {
+ .ldo_object_alloc = vvp_object_alloc
+};
+
+static const struct cl_device_operations vvp_cl_ops = {
+ .cdo_req_init = ccc_req_init
+};
+
+static struct lu_device *vvp_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg)
+{
+ return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops);
+}
+
+static const struct lu_device_type_operations vvp_device_type_ops = {
+ .ldto_init = vvp_type_init,
+ .ldto_fini = vvp_type_fini,
+
+ .ldto_start = vvp_type_start,
+ .ldto_stop = vvp_type_stop,
+
+ .ldto_device_alloc = vvp_device_alloc,
+ .ldto_device_free = ccc_device_free,
+ .ldto_device_init = ccc_device_init,
+ .ldto_device_fini = ccc_device_fini
+};
+
+struct lu_device_type vvp_device_type = {
+ .ldt_tags = LU_DEVICE_CL,
+ .ldt_name = LUSTRE_VVP_NAME,
+ .ldt_ops = &vvp_device_type_ops,
+ .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/**
+ * A mutex serializing calls to vvp_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+int vvp_global_init(void)
+{
+ int result;
+
+ result = lu_kmem_init(vvp_caches);
+ if (result == 0) {
+ result = ccc_global_init(&vvp_device_type);
+ if (result != 0)
+ lu_kmem_fini(vvp_caches);
+ }
+ return result;
+}
+
+void vvp_global_fini(void)
+{
+ ccc_global_fini(&vvp_device_type);
+ lu_kmem_fini(vvp_caches);
+}
+
+
+/*****************************************************************************
+ *
+ * mirror obd-devices into cl devices.
+ *
+ */
+
+int cl_sb_init(struct super_block *sb)
+{
+ struct ll_sb_info *sbi;
+ struct cl_device *cl;
+ struct lu_env *env;
+ int rc = 0;
+ int refcheck;
+
+ sbi = ll_s2sbi(sb);
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ cl = cl_type_setup(env, NULL, &vvp_device_type,
+ sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+ if (!IS_ERR(cl)) {
+ cl2ccc_dev(cl)->cdv_sb = sb;
+ sbi->ll_cl = cl;
+ sbi->ll_site = cl2lu_dev(cl)->ld_site;
+ }
+ cl_env_put(env, &refcheck);
+ } else
+ rc = PTR_ERR(env);
+ return rc;
+}
+
+int cl_sb_fini(struct super_block *sb)
+{
+ struct ll_sb_info *sbi;
+ struct lu_env *env;
+ struct cl_device *cld;
+ int refcheck;
+ int result;
+
+ sbi = ll_s2sbi(sb);
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ cld = sbi->ll_cl;
+
+ if (cld != NULL) {
+ cl_stack_fini(env, cld);
+ sbi->ll_cl = NULL;
+ sbi->ll_site = NULL;
+ }
+ cl_env_put(env, &refcheck);
+ result = 0;
+ } else {
+ CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+ result = PTR_ERR(env);
+ }
+ /*
+ * If the mount failed (sbi->ll_cl == NULL) and there are no other
+ * mounts, stop the device types manually (this usually happens
+ * automatically when the last device is destroyed).
+ */
+ lu_types_stop();
+ return result;
+}
+
+/****************************************************************************
+ *
+ * /proc/fs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+/*
+ * To represent the contents of a page cache as a byte stream, the following
+ * information is encoded in the 64-bit offset:
+ *
+ * - file hash bucket in lu_site::ls_hash[]      28 bits
+ *
+ * - how far the file is from the bucket head     4 bits
+ *
+ * - page index                                  32 bits
+ *
+ * The first two uniquely identify a file in the cache.
+ */
+
+#define PGC_OBJ_SHIFT (32 + 4)
+#define PGC_DEPTH_SHIFT (32)
+
+struct vvp_pgcache_id {
+ unsigned vpi_bucket;
+ unsigned vpi_depth;
+ uint32_t vpi_index;
+
+ unsigned vpi_curdep;
+ struct lu_object_header *vpi_obj;
+};
+
+static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
+{
+ CLASSERT(sizeof(pos) == sizeof(__u64));
+
+ id->vpi_index = pos & 0xffffffff;
+ id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf;
+ id->vpi_bucket = (unsigned long long)pos >> PGC_OBJ_SHIFT;
+}
+
+static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id)
+{
+ return
+ ((__u64)id->vpi_index) |
+ ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) |
+ ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT);
+}
+
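+/*
+ * cfs_hash iteration callback: skip the first id->vpi_curdep objects in the
+ * bucket, then stop at the next object header, taking a reference on it in
+ * id->vpi_obj unless the object is already dying.
+ */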
+static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
+{
+ struct vvp_pgcache_id *id = data;
+ struct lu_object_header *hdr = cfs_hash_object(hs, hnode);
+
+ if (id->vpi_curdep-- > 0)
+ return 0; /* continue */
+
+ if (lu_object_is_dying(hdr))
+ return 1;
+
+ cfs_hash_get(hs, hnode);
+ id->vpi_obj = hdr;
+ return 1;
+}
+
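+/*
+ * Look up the object at depth id->vpi_depth within hash bucket
+ * id->vpi_bucket and return its cl_object with a "dump" reference held.
+ * Returns NULL if no such object exists; when the bucket runs out of
+ * objects, vpi_depth is forced to 0xf so that the caller advances to the
+ * next bucket.
+ */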
+static struct cl_object *vvp_pgcache_obj(const struct lu_env *env,
+ struct lu_device *dev,
+ struct vvp_pgcache_id *id)
+{
+ LASSERT(lu_device_is_cl(dev));
+
+ id->vpi_depth &= 0xf;
+ id->vpi_obj = NULL;
+ id->vpi_curdep = id->vpi_depth;
+
+ cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket,
+ vvp_pgcache_obj_get, id);
+ if (id->vpi_obj != NULL) {
+ struct lu_object *lu_obj;
+
+ lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type);
+ if (lu_obj != NULL) {
+ lu_object_ref_add(lu_obj, "dump", current);
+ return lu2cl(lu_obj);
+ }
+ lu_object_put(env, lu_object_top(id->vpi_obj));
+
+ } else if (id->vpi_curdep > 0) {
+ id->vpi_depth = 0xf;
+ }
+ return NULL;
+}
+
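+/*
+ * Starting from the position encoded in @pos, scan the site's object hash
+ * for the next object that still has a cached page and return the packed
+ * position of that page, or ~0ULL when the whole page cache has been
+ * walked.
+ */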
+static loff_t vvp_pgcache_find(const struct lu_env *env,
+ struct lu_device *dev, loff_t pos)
+{
+ struct cl_object *clob;
+ struct lu_site *site;
+ struct vvp_pgcache_id id;
+
+ site = dev->ld_site;
+ vvp_pgcache_id_unpack(pos, &id);
+
+ while (1) {
+ if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash))
+ return ~0ULL;
+ clob = vvp_pgcache_obj(env, dev, &id);
+ if (clob != NULL) {
+ struct cl_object_header *hdr;
+ int nr;
+ struct cl_page *pg;
+
+ /* got an object. Find next page. */
+ hdr = cl_object_header(clob);
+
+ spin_lock(&hdr->coh_page_guard);
+ nr = radix_tree_gang_lookup(&hdr->coh_tree,
+ (void **)&pg,
+ id.vpi_index, 1);
+ if (nr > 0) {
+ id.vpi_index = pg->cp_index;
+ /* Can't support files over 16TB */
+ nr = !(pg->cp_index > 0xffffffff);
+ }
+ spin_unlock(&hdr->coh_page_guard);
+
+ lu_object_ref_del(&clob->co_lu, "dump", current);
+ cl_object_put(env, clob);
+ if (nr > 0)
+ return vvp_pgcache_id_pack(&id);
+ }
+ /* to the next object. */
+ ++id.vpi_depth;
+ id.vpi_depth &= 0xf;
+ if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
+ return ~0ULL;
+ id.vpi_index = 0;
+ }
+}
+
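+/* Append "<flag>" (prefixed with '|' if needed) to the output when the
+ * vmpage has PG_<flag> set. */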
+#define seq_page_flag(seq, page, flag, has_flags) do { \
+ if (test_bit(PG_##flag, &(page)->flags)) { \
+ seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \
+ has_flags = 1; \
+ } \
+} while (0)
+
+static void vvp_pgcache_page_show(const struct lu_env *env,
+ struct seq_file *seq, struct cl_page *page)
+{
+ struct ccc_page *cpg;
+ struct page *vmpage;
+ int has_flags;
+
+ cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+ vmpage = cpg->cpg_page;
+ seq_printf(seq, " %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [",
+ 0 /* gen */,
+ cpg, page,
+ "none",
+ cpg->cpg_write_queued ? "wq" : "- ",
+ cpg->cpg_defer_uptodate ? "du" : "- ",
+ PageWriteback(vmpage) ? "wb" : "-",
+ vmpage, vmpage->mapping->host->i_ino,
+ vmpage->mapping->host->i_generation,
+ vmpage->mapping->host, vmpage->index,
+ page_count(vmpage));
+ has_flags = 0;
+ seq_page_flag(seq, vmpage, locked, has_flags);
+ seq_page_flag(seq, vmpage, error, has_flags);
+ seq_page_flag(seq, vmpage, referenced, has_flags);
+ seq_page_flag(seq, vmpage, uptodate, has_flags);
+ seq_page_flag(seq, vmpage, dirty, has_flags);
+ seq_page_flag(seq, vmpage, writeback, has_flags);
+ seq_printf(seq, "%s]\n", has_flags ? "" : "-");
+}
+
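+/*
+ * ->show() method of the dump_page_cache seq_file: print one line describing
+ * the page at the position decoded from *v, or "missing" when the page (or
+ * its object) is no longer cached.
+ */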
+static int vvp_pgcache_show(struct seq_file *f, void *v)
+{
+ loff_t pos;
+ struct ll_sb_info *sbi;
+ struct cl_object *clob;
+ struct lu_env *env;
+ struct cl_page *page;
+ struct cl_object_header *hdr;
+ struct vvp_pgcache_id id;
+ int refcheck;
+ int result;
+
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ pos = *(loff_t *) v;
+ vvp_pgcache_id_unpack(pos, &id);
+ sbi = f->private;
+ clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id);
+ if (clob != NULL) {
+ hdr = cl_object_header(clob);
+
+ spin_lock(&hdr->coh_page_guard);
+ page = cl_page_lookup(hdr, id.vpi_index);
+ spin_unlock(&hdr->coh_page_guard);
+
+ seq_printf(f, "%8x@"DFID": ",
+ id.vpi_index, PFID(&hdr->coh_lu.loh_fid));
+ if (page != NULL) {
+ vvp_pgcache_page_show(env, f, page);
+ cl_page_put(env, page);
+ } else
+ seq_puts(f, "missing\n");
+ lu_object_ref_del(&clob->co_lu, "dump", current);
+ cl_object_put(env, clob);
+ } else
+ seq_printf(f, "%llx missing\n", pos);
+ cl_env_put(env, &refcheck);
+ result = 0;
+ } else
+ result = PTR_ERR(env);
+ return result;
+}
+
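+/*
+ * seq_file iterator methods: ->start() and ->next() map the current position
+ * to the next cached page via vvp_pgcache_find(), returning NULL once the
+ * position reaches ~0ULL, i.e. once the whole page cache has been dumped.
+ */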
+static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos)
+{
+ struct ll_sb_info *sbi;
+ struct lu_env *env;
+ int refcheck;
+
+ sbi = f->private;
+
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ sbi = f->private;
+ if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT)
+ pos = ERR_PTR(-EFBIG);
+ else {
+ *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev,
+ *pos);
+ if (*pos == ~0ULL)
+ pos = NULL;
+ }
+ cl_env_put(env, &refcheck);
+ }
+ return pos;
+}
+
+static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ struct ll_sb_info *sbi;
+ struct lu_env *env;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ sbi = f->private;
+ *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1);
+ if (*pos == ~0ULL)
+ pos = NULL;
+ cl_env_put(env, &refcheck);
+ }
+ return pos;
+}
+
+static void vvp_pgcache_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static struct seq_operations vvp_pgcache_ops = {
+ .start = vvp_pgcache_start,
+ .next = vvp_pgcache_next,
+ .stop = vvp_pgcache_stop,
+ .show = vvp_pgcache_show
+};
+
+static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp)
+{
+ struct ll_sb_info *sbi = PDE_DATA(inode);
+ struct seq_file *seq;
+ int result;
+
+ result = seq_open(filp, &vvp_pgcache_ops);
+ if (result == 0) {
+ seq = filp->private_data;
+ seq->private = sbi;
+ }
+ return result;
+}
+
+const struct file_operations vvp_dump_pgcache_file_ops = {
+ .owner = THIS_MODULE,
+ .open = vvp_dump_pgcache_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h
new file mode 100644
index 000000000..2162bf6c0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal definitions for VVP layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef VVP_INTERNAL_H
+#define VVP_INTERNAL_H
+
+
+#include "../include/cl_object.h"
+#include "llite_internal.h"
+
+int vvp_io_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io);
+int vvp_lock_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *io);
+int vvp_page_init(const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev);
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+extern const struct file_operations vvp_dump_pgcache_file_ops;
+
+#endif /* VVP_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
new file mode 100644
index 000000000..91bba7967
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -0,0 +1,1209 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for VVP layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include "../include/obd.h"
+#include "../include/lustre_lite.h"
+
+#include "vvp_internal.h"
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+
+/**
+ * Returns true if \a io is a normal io, false for splice_{read,write}.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io)
+{
+ struct vvp_io *vio = vvp_env_io(env);
+
+ LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+ return vio->cui_io_subtype == IO_NORMAL;
+}
+
+/**
+ * For swapping layouts: the file's layout may have changed.
+ * To avoid populating pages into a wrong stripe, we have to verify
+ * that the layout is still current. This works because processes
+ * swapping layouts have to acquire the group lock.
+ */
+static bool can_populate_pages(const struct lu_env *env, struct cl_io *io,
+ struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ccc_io *cio = ccc_env_io(env);
+ bool rc = true;
+
+ switch (io->ci_type) {
+ case CIT_READ:
+ case CIT_WRITE:
+ /* no lock is needed here to check lli_layout_gen, as we hold the
+ * extent lock and the GROUP lock has to be held to swap the layout */
+ if (ll_layout_version_get(lli) != cio->cui_layout_gen) {
+ io->ci_need_restart = 1;
+ /* this will return application a short read/write */
+ io->ci_continue = 0;
+ rc = false;
+ }
+ case CIT_FAULT:
+ /* fault is okay because we've already had a page. */
+ default:
+ break;
+ }
+
+ return rc;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static int vvp_io_fault_iter_init(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct inode *inode = ccc_object_inode(ios->cis_obj);
+
+ LASSERT(inode ==
+ file_inode(cl2ccc_io(env, ios)->cui_fd->fd_file));
+ vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime);
+ return 0;
+}
+
+static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct ccc_io *cio = cl2ccc_io(env, ios);
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ CDEBUG(D_VFSTRACE, DFID
+ " ignore/verify layout %d/%d, layout version %d restore needed %d\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ io->ci_ignore_layout, io->ci_verify_layout,
+ cio->cui_layout_gen, io->ci_restore_needed);
+
+ if (io->ci_restore_needed == 1) {
+ int rc;
+
+ /* the file was detected as released; we need to restore it
+ * before finishing the io
+ */
+ rc = ll_layout_restore(ccc_object_inode(obj));
+ /* if restore registration failed, no restart;
+ * we will return -ENODATA */
+ /* The layout will change after the restore, so we need to
+ * block on the layout lock held by the MDT. As the MDT will
+ * not send the new layout in the lvb (see LU-3124), we have
+ * to fetch it explicitly; all of this is done by
+ * ll_layout_refresh().
+ */
+ if (rc == 0) {
+ io->ci_restore_needed = 0;
+ io->ci_need_restart = 1;
+ io->ci_verify_layout = 1;
+ } else {
+ io->ci_restore_needed = 1;
+ io->ci_need_restart = 0;
+ io->ci_verify_layout = 0;
+ io->ci_result = rc;
+ }
+ }
+
+ if (!io->ci_ignore_layout && io->ci_verify_layout) {
+ __u32 gen = 0;
+
+ /* check layout version */
+ ll_layout_refresh(ccc_object_inode(obj), &gen);
+ io->ci_need_restart = cio->cui_layout_gen != gen;
+ if (io->ci_need_restart) {
+ CDEBUG(D_VFSTRACE,
+ DFID" layout changed from %d to %d.\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ cio->cui_layout_gen, gen);
+ /* today successful restore is the only possible
+ * case */
+ /* restore was done, clear restoring state */
+ ll_i2info(ccc_object_inode(obj))->lli_flags &=
+ ~LLIF_FILE_RESTORING;
+ }
+ }
+}
+
+static void vvp_io_fault_fini(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct cl_page *page = io->u.ci_fault.ft_page;
+
+ CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+
+ if (page != NULL) {
+ lu_ref_del(&page->cp_reference, "fault", io);
+ cl_page_put(env, page);
+ io->u.ci_fault.ft_page = NULL;
+ }
+ vvp_io_fini(env, ios);
+}
+
+static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
+{
+ /*
+ * we only want to hold PW locks if the mmap() can generate
+ * writes back to the file and that only happens in shared
+ * writable vmas
+ */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return CLM_WRITE;
+ return CLM_READ;
+}
+
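+/*
+ * Walk the user-space buffers of a read/write io and, for each range that is
+ * itself mmapped from a file, add a DLM extent lock requirement covering the
+ * mapped region to the io's lockset, so the pages can be faulted in safely
+ * while the io copies data.
+ */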
+static int vvp_mmap_locks(const struct lu_env *env,
+ struct ccc_io *vio, struct cl_io *io)
+{
+ struct ccc_thread_info *cti = ccc_env_info(env);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct cl_lock_descr *descr = &cti->cti_descr;
+ ldlm_policy_data_t policy;
+ unsigned long addr;
+ ssize_t count;
+ int result;
+ struct iov_iter i;
+ struct iovec iov;
+
+ LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+ if (!cl_is_normalio(env, io))
+ return 0;
+
+ if (vio->cui_iter == NULL) /* nfs or loopback device write */
+ return 0;
+
+ /* No mm (e.g. NFS)? Then there are no vmas either. */
+ if (mm == NULL)
+ return 0;
+
+ iov_for_each(iov, i, *(vio->cui_iter)) {
+ addr = (unsigned long)iov.iov_base;
+ count = iov.iov_len;
+ if (count == 0)
+ continue;
+
+ count += addr & (~CFS_PAGE_MASK);
+ addr &= CFS_PAGE_MASK;
+
+ down_read(&mm->mmap_sem);
+ while ((vma = our_vma(mm, addr, count)) != NULL) {
+ struct inode *inode = file_inode(vma->vm_file);
+ int flags = CEF_MUST;
+
+ if (ll_file_nolock(vma->vm_file)) {
+ /*
+ * For no lock case, a lockless lock will be
+ * generated.
+ */
+ flags = CEF_NEVER;
+ }
+
+ /*
+ * XXX: Required lock mode can be weakened: CIT_WRITE
+ * io only ever reads user level buffer, and CIT_READ
+ * only writes on it.
+ */
+ policy_from_vma(&policy, vma, addr, count);
+ descr->cld_mode = vvp_mode_from_vma(vma);
+ descr->cld_obj = ll_i2info(inode)->lli_clob;
+ descr->cld_start = cl_index(descr->cld_obj,
+ policy.l_extent.start);
+ descr->cld_end = cl_index(descr->cld_obj,
+ policy.l_extent.end);
+ descr->cld_enq_flags = flags;
+ result = cl_io_lock_alloc_add(env, io, descr);
+
+ CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+ descr->cld_mode, descr->cld_start,
+ descr->cld_end);
+
+ if (result < 0) {
+ up_read(&mm->mmap_sem);
+ return result;
+ }
+
+ if (vma->vm_end - addr >= count)
+ break;
+
+ count -= vma->vm_end - addr;
+ addr = vma->vm_end;
+ }
+ up_read(&mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+ enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+ struct ccc_io *cio = ccc_env_io(env);
+ int result;
+ int ast_flags = 0;
+
+ LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+ ccc_io_update_iov(env, cio, io);
+
+ if (io->u.ci_rw.crw_nonblock)
+ ast_flags |= CEF_NONBLOCK;
+ result = vvp_mmap_locks(env, cio, io);
+ if (result == 0)
+ result = ccc_io_one_lock(env, io, ast_flags, mode, start, end);
+ return result;
+}
+
+static int vvp_io_read_lock(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct cl_io_rw_common *rd = &io->u.ci_rd.rd;
+ int result;
+
+ result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos,
+ rd->crw_pos + rd->crw_count - 1);
+
+ return result;
+}
+
+static int vvp_io_fault_lock(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ /*
+ * XXX LDLM_FL_CBPENDING
+ */
+ return ccc_io_one_lock_index
+ (env, io, 0, vvp_mode_from_vma(vio->u.fault.ft_vma),
+ io->u.ci_fault.ft_index, io->u.ci_fault.ft_index);
+}
+
+static int vvp_io_write_lock(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ loff_t start;
+ loff_t end;
+
+ if (io->u.ci_wr.wr_append) {
+ start = 0;
+ end = OBD_OBJECT_EOF;
+ } else {
+ start = io->u.ci_wr.wr.crw_pos;
+ end = start + io->u.ci_wr.wr.crw_count - 1;
+ }
+ return vvp_io_rw_lock(env, io, CLM_WRITE, start, end);
+}
+
+static int vvp_io_setattr_iter_init(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ return 0;
+}
+
+/**
+ * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io.
+ *
+ * Handles "lockless io" mode when extent locking is done by server.
+ */
+static int vvp_io_setattr_lock(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct ccc_io *cio = ccc_env_io(env);
+ struct cl_io *io = ios->cis_io;
+ __u64 new_size;
+ __u32 enqflags = 0;
+
+ if (cl_io_is_trunc(io)) {
+ new_size = io->u.ci_setattr.sa_attr.lvb_size;
+ if (new_size == 0)
+ enqflags = CEF_DISCARD_DATA;
+ } else {
+ if ((io->u.ci_setattr.sa_attr.lvb_mtime >=
+ io->u.ci_setattr.sa_attr.lvb_ctime) ||
+ (io->u.ci_setattr.sa_attr.lvb_atime >=
+ io->u.ci_setattr.sa_attr.lvb_ctime))
+ return 0;
+ new_size = 0;
+ }
+ cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK;
+ return ccc_io_one_lock(env, io, enqflags, CLM_WRITE,
+ new_size, OBD_OBJECT_EOF);
+}
+
+static int vvp_do_vmtruncate(struct inode *inode, size_t size)
+{
+ int result;
+ /*
+ * Only ll_inode_size_lock is taken at this level.
+ */
+ ll_inode_size_lock(inode);
+ result = inode_newsize_ok(inode, size);
+ if (result < 0) {
+ ll_inode_size_unlock(inode);
+ return result;
+ }
+ truncate_setsize(inode, size);
+ ll_inode_size_unlock(inode);
+ return result;
+}
+
+static int vvp_io_setattr_trunc(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ struct inode *inode, loff_t size)
+{
+ inode_dio_wait(inode);
+ return 0;
+}
+
+static int vvp_io_setattr_time(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct cl_attr *attr = ccc_env_thread_attr(env);
+ int result;
+ unsigned valid = CAT_CTIME;
+
+ cl_object_attr_lock(obj);
+ attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
+ if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) {
+ attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
+ valid |= CAT_ATIME;
+ }
+ if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) {
+ attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
+ valid |= CAT_MTIME;
+ }
+ result = cl_object_attr_set(env, obj, attr, valid);
+ cl_object_attr_unlock(obj);
+
+ return result;
+}
+
+static int vvp_io_setattr_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct inode *inode = ccc_object_inode(io->ci_obj);
+ int result = 0;
+
+ mutex_lock(&inode->i_mutex);
+ if (cl_io_is_trunc(io))
+ result = vvp_io_setattr_trunc(env, ios, inode,
+ io->u.ci_setattr.sa_attr.lvb_size);
+ if (result == 0)
+ result = vvp_io_setattr_time(env, ios);
+ return result;
+}
+
+static void vvp_io_setattr_end(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io = ios->cis_io;
+ struct inode *inode = ccc_object_inode(io->ci_obj);
+
+ if (cl_io_is_trunc(io)) {
+ /* Truncate in-memory pages - they must be clean pages
+ * because osc has already been notified to destroy the osc_extents. */
+ vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+ inode_dio_write_done(inode);
+ }
+ mutex_unlock(&inode->i_mutex);
+}
+
+static void vvp_io_setattr_fini(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ vvp_io_fini(env, ios);
+}
+
+static int vvp_io_read_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct ccc_io *cio = cl2ccc_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = ccc_object_inode(obj);
+ struct ll_ra_read *bead = &vio->cui_bead;
+ struct file *file = cio->cui_fd->fd_file;
+
+ int result;
+ loff_t pos = io->u.ci_rd.rd.crw_pos;
+ long cnt = io->u.ci_rd.rd.crw_count;
+ long tot = cio->cui_tot_count;
+ int exceed = 0;
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+ if (!can_populate_pages(env, io, inode))
+ return 0;
+
+ result = ccc_prep_size(env, obj, io, pos, tot, &exceed);
+ if (result != 0)
+ return result;
+ else if (exceed != 0)
+ goto out;
+
+ LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+ "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+ inode->i_ino, cnt, pos, i_size_read(inode));
+
+ /* turn off the kernel's read-ahead */
+ cio->cui_fd->fd_file->f_ra.ra_pages = 0;
+
+ /* initialize read-ahead window once per syscall */
+ if (!vio->cui_ra_window_set) {
+ vio->cui_ra_window_set = 1;
+ bead->lrr_start = cl_index(obj, pos);
+ /*
+ * XXX: explicit PAGE_CACHE_SIZE
+ */
+ bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
+ ll_ra_read_in(file, bead);
+ }
+
+ /* BUG: 5972 */
+ file_accessed(file);
+ switch (vio->cui_io_subtype) {
+ case IO_NORMAL:
+ LASSERT(cio->cui_iocb->ki_pos == pos);
+ result = generic_file_read_iter(cio->cui_iocb, cio->cui_iter);
+ break;
+ case IO_SPLICE:
+ result = generic_file_splice_read(file, &pos,
+ vio->u.splice.cui_pipe, cnt,
+ vio->u.splice.cui_flags);
+ /* LU-1109: do splice read stripe by stripe, otherwise it
+ * may make nfsd stuck if this read occupies all the internal
+ * pipe buffers. */
+ io->ci_continue = 0;
+ break;
+ default:
+ CERROR("Wrong IO type %u\n", vio->cui_io_subtype);
+ LBUG();
+ }
+
+out:
+ if (result >= 0) {
+ if (result < cnt)
+ io->ci_continue = 0;
+ io->ci_nob += result;
+ ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+ cio->cui_fd, pos, result, READ);
+ result = 0;
+ }
+ return result;
+}
+
+static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct ccc_io *cio = cl2ccc_io(env, ios);
+
+ if (vio->cui_ra_window_set)
+ ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead);
+
+ vvp_io_fini(env, ios);
+}
+
+static int vvp_io_write_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct ccc_io *cio = cl2ccc_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = ccc_object_inode(obj);
+ ssize_t result = 0;
+ loff_t pos = io->u.ci_wr.wr.crw_pos;
+ size_t cnt = io->u.ci_wr.wr.crw_count;
+
+ if (!can_populate_pages(env, io, inode))
+ return 0;
+
+ if (cl_io_is_append(io)) {
+ /*
+ * PARALLEL IO: this has to be changed for parallel IO doing
+ * out-of-order writes.
+ */
+ pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+ cio->cui_iocb->ki_pos = pos;
+ } else {
+ LASSERT(cio->cui_iocb->ki_pos == pos);
+ }
+
+ CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
+
+ if (cio->cui_iter == NULL) /* from a temp io in ll_cl_init(). */
+ result = 0;
+ else
+ result = generic_file_write_iter(cio->cui_iocb, cio->cui_iter);
+
+ if (result > 0) {
+ if (result < cnt)
+ io->ci_continue = 0;
+ io->ci_nob += result;
+ ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+ cio->cui_fd, pos, result, WRITE);
+ result = 0;
+ }
+ return result;
+}
+
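+/*
+ * Let the kernel service the fault via filemap_fault() and translate the
+ * resulting VM_FAULT_* flags into an errno: 0 with a locked page on success,
+ * -EFAULT, -ENOMEM or -EAGAIN otherwise.
+ */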
+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+{
+ struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+ cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+ cfio->fault.ft_flags_valid = 1;
+
+ if (vmf->page) {
+ CDEBUG(D_PAGE,
+ "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n",
+ vmf->page, vmf->page->mapping, vmf->page->index,
+ (long)vmf->page->flags, page_count(vmf->page),
+ page_private(vmf->page), vmf->virtual_address);
+ if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
+ lock_page(vmf->page);
+ cfio->fault.ft_flags |= VM_FAULT_LOCKED;
+ }
+
+ cfio->ft_vmpage = vmf->page;
+ return 0;
+ }
+
+ if (cfio->fault.ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
+ CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
+ return -EFAULT;
+ }
+
+ if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+ CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
+ return -ENOMEM;
+ }
+
+ if (cfio->fault.ft_flags & VM_FAULT_RETRY)
+ return -EAGAIN;
+
+ CERROR("Unknown error in page fault %d!\n", cfio->fault.ft_flags);
+ return -EINVAL;
+}
+
+
+static int vvp_io_fault_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = ccc_object_inode(obj);
+ struct cl_fault_io *fio = &io->u.ci_fault;
+ struct vvp_fault_io *cfio = &vio->u.fault;
+ loff_t offset;
+ int result = 0;
+ struct page *vmpage = NULL;
+ struct cl_page *page;
+ loff_t size;
+ pgoff_t last; /* last page in a file data region */
+
+ if (fio->ft_executable &&
+ LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+ CWARN("binary "DFID
+ " changed while waiting for the page fault lock\n",
+ PFID(lu_object_fid(&obj->co_lu)));
+
+ /* offset of the last byte on the page */
+ offset = cl_offset(obj, fio->ft_index + 1) - 1;
+ LASSERT(cl_index(obj, offset) == fio->ft_index);
+ result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL);
+ if (result != 0)
+ return result;
+
+ /* must return locked page */
+ if (fio->ft_mkwrite) {
+ LASSERT(cfio->ft_vmpage != NULL);
+ lock_page(cfio->ft_vmpage);
+ } else {
+ result = vvp_io_kernel_fault(cfio);
+ if (result != 0)
+ return result;
+ }
+
+ vmpage = cfio->ft_vmpage;
+ LASSERT(PageLocked(vmpage));
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+ ll_invalidate_page(vmpage);
+
+ size = i_size_read(inode);
+ /* Though we already hold a cl_lock on this page,
+ * it can still be truncated locally. */
+ if (unlikely((vmpage->mapping != inode->i_mapping) ||
+ (page_offset(vmpage) > size))) {
+ CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+ /* return +1 to stop cl_io_loop() and ll_fault() will catch
+ * and retry. */
+ result = +1;
+ goto out;
+ }
+
+
+ if (fio->ft_mkwrite) {
+ pgoff_t last_index;
+ /*
+ * Capture the size while holding the lli_trunc_sem from above;
+ * we want to make sure that we complete the mkwrite action
+ * while holding this lock, and that we are not past the end
+ * of the file.
+ */
+ last_index = cl_index(obj, size - 1);
+ if (last_index < fio->ft_index) {
+ CDEBUG(D_PAGE,
+ "llite: mkwrite and truncate race happened: %p: 0x%lx 0x%lx\n",
+ vmpage->mapping, fio->ft_index, last_index);
+ /*
+ * We need to return an error if we are past the end of the
+ * file. This will propagate up the call stack to
+ * ll_page_mkwrite, where we will return VM_FAULT_NOPAGE.
+ * Any non-negative value returned here would be silently
+ * converted to 0. If vmpage->mapping is NULL, the error
+ * code would be converted back to ENODATA in
+ * ll_page_mkwrite0. Thus we return -ENODATA to handle
+ * both cases.
+ */
+ result = -ENODATA;
+ goto out;
+ }
+ }
+
+ page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+ if (IS_ERR(page)) {
+ result = PTR_ERR(page);
+ goto out;
+ }
+
+ /* if page is going to be written, we should add this page into cache
+ * earlier. */
+ if (fio->ft_mkwrite) {
+ wait_on_page_writeback(vmpage);
+ if (set_page_dirty(vmpage)) {
+ struct ccc_page *cp;
+
+ /* vvp_page_assume() calls wait_on_page_writeback(). */
+ cl_page_assume(env, io, page);
+
+ cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+ vvp_write_pending(cl2ccc(obj), cp);
+
+ /* Do not set Dirty bit here so that in case IO is
+ * started before the page is really made dirty, we
+ * still have a chance to detect it. */
+ result = cl_page_cache_add(env, io, page, CRT_WRITE);
+ LASSERT(cl_page_is_owned(page, io));
+
+ vmpage = NULL;
+ if (result < 0) {
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+
+ cl_page_put(env, page);
+
+ /* we're in big trouble, what can we do now? */
+ if (result == -EDQUOT)
+ result = -ENOSPC;
+ goto out;
+ } else
+ cl_page_disown(env, io, page);
+ }
+ }
+
+ last = cl_index(obj, size - 1);
+ /*
+ * The ft_index is only used in the case of
+ * a mkwrite action. We need to check
+ * our assertions are correct, since
+ * we should have caught this above
+ */
+ LASSERT(!fio->ft_mkwrite || fio->ft_index <= last);
+ if (fio->ft_index == last)
+ /*
+ * Last page is mapped partially.
+ */
+ fio->ft_nob = size - cl_offset(obj, fio->ft_index);
+ else
+ fio->ft_nob = cl_page_size(obj);
+
+ lu_ref_add(&page->cp_reference, "fault", io);
+ fio->ft_page = page;
+
+out:
+ /* return unlocked vmpage to avoid deadlocking */
+ if (vmpage != NULL)
+ unlock_page(vmpage);
+ cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+ return result;
+}
+
+static int vvp_io_fsync_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ /* we should set the TOWRITE bit on each dirty page in the radix tree
+ * to verify that pages have been written out, but this is difficult
+ * because of races. */
+ return 0;
+}
+
+static int vvp_io_read_page(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice)
+{
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = slice->cpl_obj;
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct cl_page *page = slice->cpl_page;
+ struct inode *inode = ccc_object_inode(obj);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_file_data *fd = cl2ccc_io(env, ios)->cui_fd;
+ struct ll_readahead_state *ras = &fd->fd_ras;
+ struct page *vmpage = cp->cpg_page;
+ struct cl_2queue *queue = &io->ci_queue;
+ int rc;
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+ LASSERT(slice->cpl_obj == obj);
+
+ if (sbi->ll_ra_info.ra_max_pages_per_file &&
+ sbi->ll_ra_info.ra_max_pages)
+ ras_update(sbi, inode, ras, page->cp_index,
+ cp->cpg_defer_uptodate);
+
+ /* Sanity check whether the page is protected by a lock. */
+ rc = cl_page_is_under_lock(env, io, page);
+ if (rc != -EBUSY) {
+ CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n",
+ rc == -ENODATA ? "without a lock" :
+ "match failed", rc);
+ if (rc != -ENODATA)
+ return rc;
+ }
+
+ if (cp->cpg_defer_uptodate) {
+ cp->cpg_ra_used = 1;
+ cl_page_export(env, page, 1);
+ }
+ /*
+ * Add page into the queue even when it is marked uptodate above.
+ * This will unlock it automatically as part of cl_page_list_disown().
+ */
+ cl_2queue_add(queue, page);
+ if (sbi->ll_ra_info.ra_max_pages_per_file &&
+ sbi->ll_ra_info.ra_max_pages)
+ ll_readahead(env, io, ras,
+ vmpage->mapping, &queue->c2_qin, fd->fd_flags);
+
+ return 0;
+}
+
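+/*
+ * Submit a single page for a synchronous transfer of type @crt and wait for
+ * it to complete; read pages are disowned afterwards, while write pages are
+ * left owned (and locked) even on error.
+ */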
+static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, struct ccc_page *cp,
+ enum cl_req_type crt)
+{
+ struct cl_2queue *queue;
+ int result;
+
+ LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+ queue = &io->ci_queue;
+ cl_2queue_init_page(queue, page);
+
+ result = cl_io_submit_sync(env, io, crt, queue, 0);
+ LASSERT(cl_page_is_owned(page, io));
+
+ if (crt == CRT_READ)
+ /*
+ * in CRT_WRITE case page is left locked even in case of
+ * error.
+ */
+ cl_page_list_disown(env, io, &queue->c2_qin);
+ cl_2queue_fini(env, queue);
+
+ return result;
+}
+
+/**
+ * Prepare partially written-to page for a write.
+ */
+static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
+ struct cl_object *obj, struct cl_page *pg,
+ struct ccc_page *cp,
+ unsigned from, unsigned to)
+{
+ struct cl_attr *attr = ccc_env_thread_attr(env);
+ loff_t offset = cl_offset(obj, pg->cp_index);
+ int result;
+
+ cl_object_attr_lock(obj);
+ result = cl_object_attr_get(env, obj, attr);
+ cl_object_attr_unlock(obj);
+ if (result == 0) {
+ /*
+ * If we are writing to a new page, there is no need to read old data.
+ * The extent locking will have updated the KMS, and for our
+ * purposes here we can treat it like i_size.
+ */
+ if (attr->cat_kms <= offset) {
+ char *kaddr = kmap_atomic(cp->cpg_page);
+
+ memset(kaddr, 0, cl_page_size(obj));
+ kunmap_atomic(kaddr);
+ } else if (cp->cpg_defer_uptodate)
+ cp->cpg_ra_used = 1;
+ else
+ result = vvp_page_sync_io(env, io, pg, cp, CRT_READ);
+ /*
+ * In older implementations, obdo_refresh_inode is called here
+ * to update the inode because the write might modify the
+ * object info at OST. However, this has been proven useless,
+ * since LVB functions will be called when user space program
+ * tries to retrieve inode attribute. Also, see bug 15909 for
+ * details. -jay
+ */
+ if (result == 0)
+ cl_page_export(env, pg, 1);
+ }
+ return result;
+}
+
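+/*
+ * Called before data is copied into a cached page: a full-page overwrite
+ * needs no preparation, while a partial write to a not-uptodate page is
+ * handled by vvp_io_prepare_partial().
+ */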
+static int vvp_io_prepare_write(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice,
+ unsigned from, unsigned to)
+{
+ struct cl_object *obj = slice->cpl_obj;
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct cl_page *pg = slice->cpl_page;
+ struct page *vmpage = cp->cpg_page;
+
+ int result;
+
+ LINVRNT(cl_page_is_vmlocked(env, pg));
+ LASSERT(vmpage->mapping->host == ccc_object_inode(obj));
+
+ result = 0;
+
+ CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to);
+ if (!PageUptodate(vmpage)) {
+ /*
+ * We're completely overwriting an existing page, so _don't_
+ * set it up to date until commit_write
+ */
+ if (from == 0 && to == PAGE_CACHE_SIZE) {
+ CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
+ POISON_PAGE(page, 0x11);
+ } else
+ result = vvp_io_prepare_partial(env, ios->cis_io, obj,
+ pg, cp, from, to);
+ } else
+ CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n");
+ return result;
+}
+
+static int vvp_io_commit_write(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice,
+ unsigned from, unsigned to)
+{
+ struct cl_object *obj = slice->cpl_obj;
+ struct cl_io *io = ios->cis_io;
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct cl_page *pg = slice->cpl_page;
+ struct inode *inode = ccc_object_inode(obj);
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct page *vmpage = cp->cpg_page;
+
+ int result;
+ int tallyop;
+ loff_t size;
+
+ LINVRNT(cl_page_is_vmlocked(env, pg));
+ LASSERT(vmpage->mapping->host == inode);
+
+ LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "committing page write\n");
+ CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);
+
+ /*
+ * queue a write for some time in the future the first time we
+ * dirty the page.
+ *
+ * This is different from what other file systems do: they usually
+ * just mark page (and some of its buffers) dirty and rely on
+ * balance_dirty_pages() to start a write-back. Lustre wants write-back
+ * to be started earlier for the following reasons:
+ *
+ * (1) with a large number of clients we need to limit the amount
+ * of cached data on the clients a lot;
+ *
+ * (2) large compute jobs generally want compute-only then io-only
+ * and the IO should complete as quickly as possible;
+ *
+ * (3) IO is batched up to the RPC size and is async until the
+ * client max cache is hit
+ * (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
+ *
+ */
+ if (!PageDirty(vmpage)) {
+ tallyop = LPROC_LL_DIRTY_MISSES;
+ result = cl_page_cache_add(env, io, pg, CRT_WRITE);
+ if (result == 0) {
+ /* page was added into cache successfully. */
+ set_page_dirty(vmpage);
+ vvp_write_pending(cl2ccc(obj), cp);
+ } else if (result == -EDQUOT) {
+ pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+ bool need_clip = true;
+
+ /*
+ * Client ran out of disk space grant. Possible
+ * strategies are:
+ *
+ * (a) do a sync write, renewing grant;
+ *
+ * (b) stop writing on this stripe, switch to the
+ * next one.
+ *
+ * (b) is a part of "parallel io" design that is the
+ * ultimate goal. (a) is what "old" client did, and
+ * what the new code continues to do for the time
+ * being.
+ */
+ if (last_index > pg->cp_index) {
+ to = PAGE_CACHE_SIZE;
+ need_clip = false;
+ } else if (last_index == pg->cp_index) {
+ int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
+ if (to < size_to)
+ to = size_to;
+ }
+ if (need_clip)
+ cl_page_clip(env, pg, 0, to);
+ result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
+ if (result)
+ CERROR("Write page %lu of inode %p failed %d\n",
+ pg->cp_index, inode, result);
+ }
+ } else {
+ tallyop = LPROC_LL_DIRTY_HITS;
+ result = 0;
+ }
+ ll_stats_ops_tally(sbi, tallyop, 1);
+
+ /* The inode should be marked DIRTY even if no new page was marked
+ * DIRTY, because the page might not have been flushed between two
+ * modifications. It is important that the file is marked DIRTY as
+ * soon as the I/O is done: when the cache is flushed, the file could
+ * already be closed and it would be too late to warn the MDT.
+ * It is acceptable that the file is marked DIRTY even if the I/O is
+ * dropped for some reason before being flushed to the OST.
+ */
+ if (result == 0) {
+ spin_lock(&lli->lli_lock);
+ lli->lli_flags |= LLIF_DATA_MODIFIED;
+ spin_unlock(&lli->lli_lock);
+ }
+
+ size = cl_offset(obj, pg->cp_index) + to;
+
+ ll_inode_size_lock(inode);
+ if (result == 0) {
+ if (size > i_size_read(inode)) {
+ cl_isize_write_nolock(inode, size);
+ CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ (unsigned long)size);
+ }
+ cl_page_export(env, pg, 1);
+ } else {
+ if (size > i_size_read(inode))
+ cl_page_discard(env, io, pg);
+ }
+ ll_inode_size_unlock(inode);
+ return result;
+}
+
+static const struct cl_io_operations vvp_io_ops = {
+ .op = {
+ [CIT_READ] = {
+ .cio_fini = vvp_io_read_fini,
+ .cio_lock = vvp_io_read_lock,
+ .cio_start = vvp_io_read_start,
+ .cio_advance = ccc_io_advance
+ },
+ [CIT_WRITE] = {
+ .cio_fini = vvp_io_fini,
+ .cio_lock = vvp_io_write_lock,
+ .cio_start = vvp_io_write_start,
+ .cio_advance = ccc_io_advance
+ },
+ [CIT_SETATTR] = {
+ .cio_fini = vvp_io_setattr_fini,
+ .cio_iter_init = vvp_io_setattr_iter_init,
+ .cio_lock = vvp_io_setattr_lock,
+ .cio_start = vvp_io_setattr_start,
+ .cio_end = vvp_io_setattr_end
+ },
+ [CIT_FAULT] = {
+ .cio_fini = vvp_io_fault_fini,
+ .cio_iter_init = vvp_io_fault_iter_init,
+ .cio_lock = vvp_io_fault_lock,
+ .cio_start = vvp_io_fault_start,
+ .cio_end = ccc_io_end
+ },
+ [CIT_FSYNC] = {
+ .cio_start = vvp_io_fsync_start,
+ .cio_fini = vvp_io_fini
+ },
+ [CIT_MISC] = {
+ .cio_fini = vvp_io_fini
+ }
+ },
+ .cio_read_page = vvp_io_read_page,
+ .cio_prepare_write = vvp_io_prepare_write,
+ .cio_commit_write = vvp_io_commit_write
+};
+
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io)
+{
+ struct vvp_io *vio = vvp_env_io(env);
+ struct ccc_io *cio = ccc_env_io(env);
+ struct inode *inode = ccc_object_inode(obj);
+ int result;
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ CDEBUG(D_VFSTRACE, DFID
+ " ignore/verify layout %d/%d, layout version %d restore needed %d\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ io->ci_ignore_layout, io->ci_verify_layout,
+ cio->cui_layout_gen, io->ci_restore_needed);
+
+ CL_IO_SLICE_CLEAN(cio, cui_cl);
+ cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
+ vio->cui_ra_window_set = 0;
+ result = 0;
+ if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
+ size_t count;
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ count = io->u.ci_rw.crw_count;
+ /* "If nbyte is 0, read() will return 0 and have no other
+ * results." -- Single Unix Spec */
+ if (count == 0)
+ result = 1;
+ else
+ cio->cui_tot_count = count;
+
+ /* For read/write, we store the jobid in the inode; it will
+ * be fetched by osc when building the RPC.
+ *
+ * It is not accurate if the file is shared by different
+ * jobs.
+ */
+ lustre_get_jobid(lli->lli_jobid);
+ } else if (io->ci_type == CIT_SETATTR) {
+ if (!cl_io_is_trunc(io))
+ io->ci_lockreq = CILR_MANDATORY;
+ }
+
+ /* Ignore layout changes for generic CIT_MISC, but not for glimpse:
+ * the io context for glimpse must set ci_verify_layout to true,
+ * see cl_glimpse_size0() for details. */
+ if (io->ci_type == CIT_MISC && !io->ci_verify_layout)
+ io->ci_ignore_layout = 1;
+
+ /* Enqueue the layout lock and get the layout version. We need to do
+ * this even for operations that require opening the file, such as
+ * read and write, because the layout lock might not be granted in
+ * IT_OPEN. */
+ if (result == 0 && !io->ci_ignore_layout) {
+ result = ll_layout_refresh(inode, &cio->cui_layout_gen);
+ if (result == -ENOENT)
+ /* If the inode on the MDS has been removed, but the
+ * objects on the OSTs haven't been destroyed (async
+ * unlink), the layout fetch will return -ENOENT; ignore
+ * this error and continue with the dirty flush (LU-3230). */
+ result = 0;
+ if (result < 0)
+ CERROR("%s: refresh file layout " DFID " error %d.\n",
+ ll_get_fsname(inode->i_sb, NULL, 0),
+ PFID(lu_object_fid(&obj->co_lu)), result);
+ }
+
+ return result;
+}
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ /* Calling just for assertion */
+ cl2ccc_io(env, slice);
+ return vvp_env_io(env);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c
new file mode 100644
index 000000000..f354e82d4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_lock.c
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for VVP layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include "../include/obd.h"
+#include "../include/lustre_lite.h"
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp lock functions.
+ *
+ */
+
+/**
+ * Estimates lock value for the purpose of managing the lock cache during
+ * memory shortages.
+ *
+ * Locks for memory mapped files are almost infinitely precious, others are
+ * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are
+ * ordered within themselves by weights assigned from other layers.
+ */
+static unsigned long vvp_lock_weigh(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct ccc_object *cob = cl2ccc(slice->cls_obj);
+
+ return atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0;
+}
+
+static const struct cl_lock_operations vvp_lock_ops = {
+ .clo_delete = ccc_lock_delete,
+ .clo_fini = ccc_lock_fini,
+ .clo_enqueue = ccc_lock_enqueue,
+ .clo_wait = ccc_lock_wait,
+ .clo_use = ccc_lock_use,
+ .clo_unuse = ccc_lock_unuse,
+ .clo_fits_into = ccc_lock_fits_into,
+ .clo_state = ccc_lock_state,
+ .clo_weigh = vvp_lock_weigh
+};
+
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io)
+{
+ return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c
new file mode 100644
index 000000000..b6f6d4cb6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_object.c
@@ -0,0 +1,201 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_object implementation for VVP layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd.h"
+#include "../include/lustre_lite.h"
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ struct ccc_object *obj = lu2ccc(o);
+ struct inode *inode = obj->cob_inode;
+ struct ll_inode_info *lli;
+
+ (*p)(env, cookie, "(%s %d %d) inode: %p ",
+ list_empty(&obj->cob_pending_list) ? "-" : "+",
+ obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt),
+ inode);
+ if (inode) {
+ lli = ll_i2info(inode);
+ (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID,
+ inode->i_ino, inode->i_generation, inode->i_mode,
+ inode->i_nlink, atomic_read(&inode->i_count),
+ lli->lli_clob, PFID(&lli->lli_fid));
+ }
+ return 0;
+}
+
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ struct inode *inode = ccc_object_inode(obj);
+
+ /*
+ * lov overwrites most of these fields in
+ * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+ * attributes are newer.
+ */
+
+ attr->cat_size = i_size_read(inode);
+ attr->cat_mtime = LTIME_S(inode->i_mtime);
+ attr->cat_atime = LTIME_S(inode->i_atime);
+ attr->cat_ctime = LTIME_S(inode->i_ctime);
+ attr->cat_blocks = inode->i_blocks;
+ attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid);
+ attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid);
+ /* KMS is not known by this layer */
+ return 0; /* layers below have to fill in the rest */
+}
+
+static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ struct inode *inode = ccc_object_inode(obj);
+
+ if (valid & CAT_UID)
+ inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid);
+ if (valid & CAT_GID)
+ inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid);
+ if (valid & CAT_ATIME)
+ LTIME_S(inode->i_atime) = attr->cat_atime;
+ if (valid & CAT_MTIME)
+ LTIME_S(inode->i_mtime) = attr->cat_mtime;
+ if (valid & CAT_CTIME)
+ LTIME_S(inode->i_ctime) = attr->cat_ctime;
+ if (0 && valid & CAT_SIZE)
+ cl_isize_write_nolock(inode, attr->cat_size);
+ /* not currently necessary */
+ if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE))
+ mark_inode_dirty(inode);
+ return 0;
+}
+
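+/*
+ * Apply a layout configuration change to the inode: OBJECT_CONF_INVALIDATE
+ * drops the layout generation and unmaps all mmapped pages, while
+ * OBJECT_CONF_SET records the new layout generation (or marks the layout
+ * empty).
+ */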
+static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf)
+{
+ struct ll_inode_info *lli = ll_i2info(conf->coc_inode);
+
+ if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+ CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n",
+ PFID(&lli->lli_fid));
+
+ ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE);
+
+ /* Clean up page mmaps for this inode.
+ * The reason we do this is that if a page has already been
+ * installed into the memory space, the process can access it
+ * without interacting with Lustre, so the page may be stale
+ * after a layout change and the process will never be
+ * notified.
+ * This operation is expensive, but mmapping processes have to
+ * pay the price themselves. */
+ unmap_mapping_range(conf->coc_inode->i_mapping,
+ 0, OBD_OBJECT_EOF, 0);
+
+ return 0;
+ }
+
+ if (conf->coc_opc != OBJECT_CONF_SET)
+ return 0;
+
+ if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) {
+ CDEBUG(D_VFSTRACE, DFID ": layout version change: %u -> %u\n",
+ PFID(&lli->lli_fid), lli->lli_layout_gen,
+ conf->u.coc_md->lsm->lsm_layout_gen);
+
+ lli->lli_has_smd = lsm_has_objects(conf->u.coc_md->lsm);
+ ll_layout_version_set(lli, conf->u.coc_md->lsm->lsm_layout_gen);
+ } else {
+ CDEBUG(D_VFSTRACE, DFID ": layout nuked: %u.\n",
+ PFID(&lli->lli_fid), lli->lli_layout_gen);
+
+ lli->lli_has_smd = false;
+ ll_layout_version_set(lli, LL_LAYOUT_GEN_EMPTY);
+ }
+ return 0;
+}
+
+static const struct cl_object_operations vvp_ops = {
+ .coo_page_init = vvp_page_init,
+ .coo_lock_init = vvp_lock_init,
+ .coo_io_init = vvp_io_init,
+ .coo_attr_get = vvp_attr_get,
+ .coo_attr_set = vvp_attr_set,
+ .coo_conf_set = vvp_conf_set,
+ .coo_glimpse = ccc_object_glimpse
+};
+
+static const struct lu_object_operations vvp_lu_obj_ops = {
+ .loo_object_init = ccc_object_init,
+ .loo_object_free = ccc_object_free,
+ .loo_object_print = vvp_object_print
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode)
+{
+ struct cl_inode_info *lli = cl_i2info(inode);
+ struct cl_object *obj = lli->lli_clob;
+ struct lu_object *lu;
+
+ LASSERT(obj != NULL);
+ lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type);
+ LASSERT(lu != NULL);
+ return lu2ccc(lu);
+}
+
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev)
+{
+ return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
new file mode 100644
index 000000000..954ed08c6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/vvp_page.c
@@ -0,0 +1,551 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for VVP layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include "../include/obd.h"
+#include "../include/lustre_lite.h"
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+static void vvp_page_fini_common(struct ccc_page *cp)
+{
+ struct page *vmpage = cp->cpg_page;
+
+ LASSERT(vmpage != NULL);
+ page_cache_release(vmpage);
+}
+
+static void vvp_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct page *vmpage = cp->cpg_page;
+
+ /*
+ * vmpage->private was already cleared when page was moved into
+ * VPG_FREEING state.
+ */
+ LASSERT((struct cl_page *)vmpage->private != slice->cpl_page);
+ vvp_page_fini_common(cp);
+}
+
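+/*
+ * Own the page on behalf of an io: lock the vmpage and wait for write-back
+ * to finish; in the non-blocking case, fail with -EAGAIN instead of
+ * sleeping.
+ */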
+static int vvp_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io,
+ int nonblock)
+{
+ struct ccc_page *vpg = cl2ccc_page(slice);
+ struct page *vmpage = vpg->cpg_page;
+
+ LASSERT(vmpage != NULL);
+ if (nonblock) {
+ if (!trylock_page(vmpage))
+ return -EAGAIN;
+
+ if (unlikely(PageWriteback(vmpage))) {
+ unlock_page(vmpage);
+ return -EAGAIN;
+ }
+
+ return 0;
+ }
+
+ lock_page(vmpage);
+ wait_on_page_writeback(vmpage);
+ return 0;
+}
+
+static void vvp_page_assume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct page *vmpage = cl2vm_page(slice);
+
+ LASSERT(vmpage != NULL);
+ LASSERT(PageLocked(vmpage));
+ wait_on_page_writeback(vmpage);
+}
+
+static void vvp_page_unassume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct page *vmpage = cl2vm_page(slice);
+
+ LASSERT(vmpage != NULL);
+ LASSERT(PageLocked(vmpage));
+}
+
+static void vvp_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io)
+{
+ struct page *vmpage = cl2vm_page(slice);
+
+ LASSERT(vmpage != NULL);
+ LASSERT(PageLocked(vmpage));
+
+ unlock_page(cl2vm_page(slice));
+}
+
+static void vvp_page_discard(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct page *vmpage = cl2vm_page(slice);
+ struct address_space *mapping;
+ struct ccc_page *cpg = cl2ccc_page(slice);
+
+ LASSERT(vmpage != NULL);
+ LASSERT(PageLocked(vmpage));
+
+ mapping = vmpage->mapping;
+
+ if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used)
+ ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
+
+ /*
+ * truncate_complete_page() calls
+ * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete().
+ */
+ truncate_complete_page(mapping, vmpage);
+}
+
+static int vvp_page_unmap(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct page *vmpage = cl2vm_page(slice);
+ __u64 offset;
+
+ LASSERT(vmpage != NULL);
+ LASSERT(PageLocked(vmpage));
+
+ offset = (__u64)vmpage->index << PAGE_CACHE_SHIFT;
+
+ /*
+ * XXX is it safe to call this with the page lock held?
+ */
+ ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE);
+ return 0;
+}
+
+static void vvp_page_delete(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ struct page *vmpage = cl2vm_page(slice);
+ struct inode *inode = vmpage->mapping->host;
+ struct cl_object *obj = slice->cpl_obj;
+
+ LASSERT(PageLocked(vmpage));
+ LASSERT((struct cl_page *)vmpage->private == slice->cpl_page);
+ LASSERT(inode == ccc_object_inode(obj));
+
+ vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice));
+ ClearPagePrivate(vmpage);
+ vmpage->private = 0;
+ /*
+ * Reference from vmpage to cl_page is removed, but the reference back
+ * is still here. It is removed later in vvp_page_fini().
+ */
+}
+
+static void vvp_page_export(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int uptodate)
+{
+ struct page *vmpage = cl2vm_page(slice);
+
+ LASSERT(vmpage != NULL);
+ LASSERT(PageLocked(vmpage));
+ if (uptodate)
+ SetPageUptodate(vmpage);
+ else
+ ClearPageUptodate(vmpage);
+}
+
+static int vvp_page_is_vmlocked(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA;
+}
+
+static int vvp_page_prep_read(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ /* Skip the page already marked as PG_uptodate. */
+ return PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0;
+}
+
+static int vvp_page_prep_write(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct page *vmpage = cl2vm_page(slice);
+
+ LASSERT(PageLocked(vmpage));
+ LASSERT(!PageDirty(vmpage));
+
+ set_page_writeback(vmpage);
+ vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice));
+
+ return 0;
+}
+
+/**
+ * Handles page transfer errors at VM level.
+ *
+ * This takes the inode as a separate argument, because the inode on which the
+ * error is to be set can be different from the \a vmpage inode in the case of
+ * direct IO.
+ */
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+{
+ struct ccc_object *obj = cl_inode2ccc(inode);
+
+ if (ioret == 0) {
+ ClearPageError(vmpage);
+ obj->cob_discard_page_warned = 0;
+ } else {
+ SetPageError(vmpage);
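+ /*
+ * Record the error on the mapping flags (AS_ENOSPC/AS_EIO) so that
+ * a later fsync()/writeback wait can report it to the application.
+ */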
+ if (ioret == -ENOSPC)
+ set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+ else
+ set_bit(AS_EIO, &inode->i_mapping->flags);
+
+ if ((ioret == -ESHUTDOWN || ioret == -EINTR) &&
+ obj->cob_discard_page_warned == 0) {
+ obj->cob_discard_page_warned = 1;
+ ll_dirty_page_discard_warn(vmpage, ioret);
+ }
+ }
+}
+
+static void vvp_page_completion_read(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret)
+{
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct page *vmpage = cp->cpg_page;
+ struct cl_page *page = cl_page_top(slice->cpl_page);
+ struct inode *inode = ccc_object_inode(page->cp_obj);
+
+ LASSERT(PageLocked(vmpage));
+ CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
+
+ if (cp->cpg_defer_uptodate)
+ ll_ra_count_put(ll_i2sbi(inode), 1);
+
+ if (ioret == 0) {
+ if (!cp->cpg_defer_uptodate)
+ cl_page_export(env, page, 1);
+ } else
+ cp->cpg_defer_uptodate = 0;
+
+ if (page->cp_sync_io == NULL)
+ unlock_page(vmpage);
+}
+
+static void vvp_page_completion_write(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret)
+{
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct cl_page *pg = slice->cpl_page;
+ struct page *vmpage = cp->cpg_page;
+
+ LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage)));
+ LASSERT(PageWriteback(vmpage));
+
+ CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+ /*
+ * TODO: Actually it makes sense to add the page back into the oap
+ * pending list, so that we don't need to take the page out of the
+ * SoM write pending list if we just hit a recoverable error
+ * (-ENOMEM, etc.).
+ * To implement this, we just need to return a non-zero value from the
+ * ->cpo_completion method. The underlying transfer would then be
+ * notified and re-add the page into the pending transfer queue. -jay
+ */
+
+ cp->cpg_write_queued = 0;
+ vvp_write_complete(cl2ccc(slice->cpl_obj), cp);
+
+ /*
+ * Only mark the page in error when it's an async write, because
+ * applications won't wait for the IO to finish.
+ */
+ if (pg->cp_sync_io == NULL)
+ vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret);
+
+ end_page_writeback(vmpage);
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of a transfer. This function try-locks the page. If the try-lock
+ * fails, the page is owned by some concurrent IO and should be skipped (this
+ * is bad, but hopefully a rare situation, as it usually results in the
+ * transfer being shorter than possible).
+ *
+ * \retval 0 success, page can be placed into transfer
+ *
+ * \retval -EAGAIN page is either used by concurrent IO or has been
+ * truncated. Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ struct page *vmpage = cl2vm_page(slice);
+ struct cl_page *pg = slice->cpl_page;
+ int result = 0;
+
+ lock_page(vmpage);
+ if (clear_page_dirty_for_io(vmpage)) {
+ LASSERT(pg->cp_state == CPS_CACHED);
+ /* This actually clears the dirty bit in the radix
+ * tree. */
+ set_page_writeback(vmpage);
+ vvp_write_pending(cl2ccc(slice->cpl_obj),
+ cl2ccc_page(slice));
+ CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+ } else if (pg->cp_state == CPS_PAGEOUT) {
+ /* is it possible for osc_flush_async_page() to already
+ * make it ready? */
+ result = -EALREADY;
+ } else {
+ CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpected page state %d.\n",
+ pg->cp_state);
+ LBUG();
+ }
+ unlock_page(vmpage);
+ return result;
+}
+
+static int vvp_page_print(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t printer)
+{
+ struct ccc_page *vp = cl2ccc_page(slice);
+ struct page *vmpage = vp->cpg_page;
+
+ (*printer)(env, cookie, LUSTRE_VVP_NAME "-page@%p(%d:%d:%d) vm@%p ",
+ vp, vp->cpg_defer_uptodate, vp->cpg_ra_used,
+ vp->cpg_write_queued, vmpage);
+ if (vmpage != NULL) {
+ (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+ (long)vmpage->flags, page_count(vmpage),
+ page_mapcount(vmpage), vmpage->private,
+ page_index(vmpage),
+ list_empty(&vmpage->lru) ? "not-" : "");
+ }
+ (*printer)(env, cookie, "\n");
+ return 0;
+}
+
+static const struct cl_page_operations vvp_page_ops = {
+ .cpo_own = vvp_page_own,
+ .cpo_assume = vvp_page_assume,
+ .cpo_unassume = vvp_page_unassume,
+ .cpo_disown = vvp_page_disown,
+ .cpo_vmpage = ccc_page_vmpage,
+ .cpo_discard = vvp_page_discard,
+ .cpo_delete = vvp_page_delete,
+ .cpo_unmap = vvp_page_unmap,
+ .cpo_export = vvp_page_export,
+ .cpo_is_vmlocked = vvp_page_is_vmlocked,
+ .cpo_fini = vvp_page_fini,
+ .cpo_print = vvp_page_print,
+ .cpo_is_under_lock = ccc_page_is_under_lock,
+ .io = {
+ [CRT_READ] = {
+ .cpo_prep = vvp_page_prep_read,
+ .cpo_completion = vvp_page_completion_read,
+ .cpo_make_ready = ccc_fail,
+ },
+ [CRT_WRITE] = {
+ .cpo_prep = vvp_page_prep_write,
+ .cpo_completion = vvp_page_completion_write,
+ .cpo_make_ready = vvp_page_make_ready,
+ }
+ }
+};
+
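+/*
+ * Transient pages (cp_type != CPT_CACHEABLE) are not attached to the vmpage
+ * via ->private and are serialized by the inode's i_mutex instead of the
+ * page lock, hence the i_mutex assertions in the helpers below.
+ */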
+static void vvp_transient_page_verify(const struct cl_page *page)
+{
+ struct inode *inode = ccc_object_inode(page->cp_obj);
+
+ LASSERT(!mutex_trylock(&inode->i_mutex));
+}
+
+static int vvp_transient_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused, int nonblock)
+{
+ vvp_transient_page_verify(slice->cpl_page);
+ return 0;
+}
+
+static void vvp_transient_page_assume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_unassume(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_discard(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct cl_page *page = slice->cpl_page;
+
+ vvp_transient_page_verify(slice->cpl_page);
+
+ /*
+ * For a transient page, remove it from the radix tree.
+ */
+ cl_page_delete(env, page);
+}
+
+static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ struct inode *inode = ccc_object_inode(slice->cpl_obj);
+ int locked;
+
+ locked = !mutex_trylock(&inode->i_mutex);
+ if (!locked)
+ mutex_unlock(&inode->i_mutex);
+ return locked ? -EBUSY : -ENODATA;
+}
+
+static void
+vvp_transient_page_completion(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret)
+{
+ vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+ struct ccc_page *cp = cl2ccc_page(slice);
+ struct cl_page *clp = slice->cpl_page;
+ struct ccc_object *clobj = cl2ccc(clp->cp_obj);
+
+ vvp_page_fini_common(cp);
+ LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+ clobj->cob_transient_pages--;
+}
+
+static const struct cl_page_operations vvp_transient_page_ops = {
+ .cpo_own = vvp_transient_page_own,
+ .cpo_assume = vvp_transient_page_assume,
+ .cpo_unassume = vvp_transient_page_unassume,
+ .cpo_disown = vvp_transient_page_disown,
+ .cpo_discard = vvp_transient_page_discard,
+ .cpo_vmpage = ccc_page_vmpage,
+ .cpo_fini = vvp_transient_page_fini,
+ .cpo_is_vmlocked = vvp_transient_page_is_vmlocked,
+ .cpo_print = vvp_page_print,
+ .cpo_is_under_lock = ccc_page_is_under_lock,
+ .io = {
+ [CRT_READ] = {
+ .cpo_prep = ccc_transient_page_prep,
+ .cpo_completion = vvp_transient_page_completion,
+ },
+ [CRT_WRITE] = {
+ .cpo_prep = ccc_transient_page_prep,
+ .cpo_completion = vvp_transient_page_completion,
+ }
+ }
+};
+
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage)
+{
+ struct ccc_page *cpg = cl_object_page_slice(obj, page);
+
+ CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+ cpg->cpg_page = vmpage;
+ page_cache_get(vmpage);
+
+ INIT_LIST_HEAD(&cpg->cpg_pending_linkage);
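+ /*
+ * Cacheable pages are linked to the vmpage via ->private and use
+ * vvp_page_ops; all other (transient) pages rely on i_mutex and use
+ * vvp_transient_page_ops instead.
+ */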
+ if (page->cp_type == CPT_CACHEABLE) {
+ SetPagePrivate(vmpage);
+ vmpage->private = (unsigned long)page;
+ cl_page_slice_add(page, &cpg->cpg_cl, obj,
+ &vvp_page_ops);
+ } else {
+ struct ccc_object *clobj = cl2ccc(obj);
+
+ LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+ cl_page_slice_add(page, &cpg->cpg_cl, obj,
+ &vvp_transient_page_ops);
+ clobj->cob_transient_pages++;
+ }
+ return 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
new file mode 100644
index 000000000..e0fcbe139
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/xattr.c
@@ -0,0 +1,621 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/selinux.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include "../include/obd_support.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_ver.h"
+#include "../include/lustre_eacl.h"
+
+#include "llite_internal.h"
+
+#define XATTR_USER_T (1)
+#define XATTR_TRUSTED_T (2)
+#define XATTR_SECURITY_T (3)
+#define XATTR_ACL_ACCESS_T (4)
+#define XATTR_ACL_DEFAULT_T (5)
+#define XATTR_LUSTRE_T (6)
+#define XATTR_OTHER_T (7)
+
+static
+int get_xattr_type(const char *name)
+{
+ if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+ return XATTR_ACL_ACCESS_T;
+
+ if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+ return XATTR_ACL_DEFAULT_T;
+
+ if (!strncmp(name, XATTR_USER_PREFIX,
+ sizeof(XATTR_USER_PREFIX) - 1))
+ return XATTR_USER_T;
+
+ if (!strncmp(name, XATTR_TRUSTED_PREFIX,
+ sizeof(XATTR_TRUSTED_PREFIX) - 1))
+ return XATTR_TRUSTED_T;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1))
+ return XATTR_SECURITY_T;
+
+ if (!strncmp(name, XATTR_LUSTRE_PREFIX,
+ sizeof(XATTR_LUSTRE_PREFIX) - 1))
+ return XATTR_LUSTRE_T;
+
+ return XATTR_OTHER_T;
+}
+
+static
+int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
+{
+ if ((xattr_type == XATTR_ACL_ACCESS_T ||
+ xattr_type == XATTR_ACL_DEFAULT_T) &&
+ !(sbi->ll_flags & LL_SBI_ACL))
+ return -EOPNOTSUPP;
+
+ if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
+ return -EOPNOTSUPP;
+ if (xattr_type == XATTR_TRUSTED_T && !capable(CFS_CAP_SYS_ADMIN))
+ return -EPERM;
+ if (xattr_type == XATTR_OTHER_T)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static
+int ll_setxattr_common(struct inode *inode, const char *name,
+ const void *value, size_t size,
+ int flags, __u64 valid)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *req = NULL;
+ int xattr_type, rc;
+ struct obd_capa *oc;
+#ifdef CONFIG_FS_POSIX_ACL
+ struct rmtacl_ctl_entry *rce = NULL;
+ posix_acl_xattr_header *new_value = NULL;
+ ext_acl_xattr_header *acl = NULL;
+#endif
+ const char *pv = value;
+
+ xattr_type = get_xattr_type(name);
+ rc = xattr_type_filter(sbi, xattr_type);
+ if (rc)
+ return rc;
+
+ if ((xattr_type == XATTR_ACL_ACCESS_T ||
+ xattr_type == XATTR_ACL_DEFAULT_T) &&
+ !inode_owner_or_capable(inode))
+ return -EPERM;
+
+ /* b10667: ignore lustre special xattr for now */
+ if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) ||
+ (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0))
+ return 0;
+
+ /* b15587: ignore security.capability xattr for now */
+ if ((xattr_type == XATTR_SECURITY_T &&
+ strcmp(name, "security.capability") == 0))
+ return 0;
+
+ /* LU-549: Disable security.selinux when selinux is disabled */
+ if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+ strcmp(name, "security.selinux") == 0)
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+ (xattr_type == XATTR_ACL_ACCESS_T ||
+ xattr_type == XATTR_ACL_DEFAULT_T)) {
+ rce = rct_search(&sbi->ll_rct, current_pid());
+ if (rce == NULL ||
+ (rce->rce_ops != RMT_LSETFACL &&
+ rce->rce_ops != RMT_RSETFACL))
+ return -EOPNOTSUPP;
+
+ if (rce->rce_ops == RMT_LSETFACL) {
+ struct eacl_entry *ee;
+
+ ee = et_search_del(&sbi->ll_et, current_pid(),
+ ll_inode2fid(inode), xattr_type);
+ LASSERT(ee != NULL);
+ if (valid & OBD_MD_FLXATTR) {
+ acl = lustre_acl_xattr_merge2ext(
+ (posix_acl_xattr_header *)value,
+ size, ee->ee_acl);
+ if (IS_ERR(acl)) {
+ ee_free(ee);
+ return PTR_ERR(acl);
+ }
+ size = CFS_ACL_XATTR_SIZE(
+ le32_to_cpu(acl->a_count),
+ ext_acl_xattr);
+ pv = (const char *)acl;
+ }
+ ee_free(ee);
+ } else if (rce->rce_ops == RMT_RSETFACL) {
+ int acl_size;
+
+ acl_size = lustre_posix_acl_xattr_filter(
+ (posix_acl_xattr_header *)value,
+ size, &new_value);
+ /* "size" is an unsigned size_t and cannot hold a
+ * negative errno, so use a signed temporary here. */
+ if (unlikely(acl_size < 0))
+ return acl_size;
+
+ size = acl_size;
+ pv = (const char *)new_value;
+ } else
+ return -EOPNOTSUPP;
+
+ valid |= rce_ops2valid(rce->rce_ops);
+ }
+#endif
+ oc = ll_mdscapa_get(inode);
+ rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+ valid, name, pv, size, 0, flags,
+ ll_i2suppgid(inode), &req);
+ capa_put(oc);
+#ifdef CONFIG_FS_POSIX_ACL
+ if (new_value != NULL)
+ lustre_posix_acl_xattr_free(new_value, size);
+ if (acl != NULL)
+ lustre_ext_acl_xattr_free(acl);
+#endif
+ if (rc) {
+ if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+ LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n");
+ sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+ }
+ return rc;
+ }
+
+ ptlrpc_req_finished(req);
+ return 0;
+}
+
+int ll_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct inode *inode = d_inode(dentry);
+
+ LASSERT(inode);
+ LASSERT(name);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+ inode->i_ino, inode->i_generation, inode, name);
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);
+
+ if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+ sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+ strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+ (strncmp(name, XATTR_LUSTRE_PREFIX,
+ sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+ strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
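+ /*
+ * "trusted.lov"/"lustre.lov" carry striping information, so
+ * setting them is turned into a setstripe operation instead of
+ * a regular xattr RPC.
+ */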
+ struct lov_user_md *lump = (struct lov_user_md *)value;
+ int rc = 0;
+
+ if (size != 0 && size < sizeof(struct lov_user_md))
+ return -EINVAL;
+
+ /* Attributes that are saved via getxattr will always have
+ * the stripe_offset as 0. Instead, the MDS should be
+ * allowed to pick the starting OST index. b=17846 */
+ if (lump != NULL && lump->lmm_stripe_offset == 0)
+ lump->lmm_stripe_offset = -1;
+
+ if (lump != NULL && S_ISREG(inode->i_mode)) {
+ int flags = FMODE_WRITE;
+ int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ?
+ sizeof(*lump) : sizeof(struct lov_user_md_v3);
+
+ rc = ll_lov_setstripe_ea_info(inode, dentry, flags, lump,
+ lum_size);
+ /* b10667: rc is always 0 here for now */
+ rc = 0;
+ } else if (S_ISDIR(inode->i_mode)) {
+ rc = ll_dir_setstripe(inode, lump, 0);
+ }
+
+ return rc;
+
+ } else if (strcmp(name, XATTR_NAME_LMA) == 0 ||
+ strcmp(name, XATTR_NAME_LINK) == 0)
+ return 0;
+
+ return ll_setxattr_common(inode, name, value, size, flags,
+ OBD_MD_FLXATTR);
+}
+
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+ struct inode *inode = d_inode(dentry);
+
+ LASSERT(inode);
+ LASSERT(name);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+ inode->i_ino, inode->i_generation, inode, name);
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1);
+ return ll_setxattr_common(inode, name, NULL, 0, 0,
+ OBD_MD_FLXATTRRM);
+}
+
+static
+int ll_getxattr_common(struct inode *inode, const char *name,
+ void *buffer, size_t size, __u64 valid)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *req = NULL;
+ struct mdt_body *body;
+ int xattr_type, rc;
+ void *xdata;
+ struct obd_capa *oc;
+ struct rmtacl_ctl_entry *rce = NULL;
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+ inode->i_ino, inode->i_generation, inode);
+
+ /* listxattr has slightly different behavior from that of ext3:
+ * without 'user_xattr' ext3 will list all xattr names but
+ * filter out "^user..*"; we list them all for simplicity.
+ */
+ if (!name) {
+ xattr_type = XATTR_OTHER_T;
+ goto do_getxattr;
+ }
+
+ xattr_type = get_xattr_type(name);
+ rc = xattr_type_filter(sbi, xattr_type);
+ if (rc)
+ return rc;
+
+ /* b15587: ignore security.capability xattr for now */
+ if ((xattr_type == XATTR_SECURITY_T &&
+ strcmp(name, "security.capability") == 0))
+ return -ENODATA;
+
+ /* LU-549: Disable security.selinux when selinux is disabled */
+ if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+ strcmp(name, "security.selinux") == 0)
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+ (xattr_type == XATTR_ACL_ACCESS_T ||
+ xattr_type == XATTR_ACL_DEFAULT_T)) {
+ rce = rct_search(&sbi->ll_rct, current_pid());
+ if (rce == NULL ||
+ (rce->rce_ops != RMT_LSETFACL &&
+ rce->rce_ops != RMT_LGETFACL &&
+ rce->rce_ops != RMT_RSETFACL &&
+ rce->rce_ops != RMT_RGETFACL))
+ return -EOPNOTSUPP;
+ }
+
+ /* The posix ACL is under the protection of the LOOKUP lock. When we get
+ * here, path resolution to the target inode has just been done, so there
+ * is a good chance that the cached ACL is up to date.
+ */
+ if (xattr_type == XATTR_ACL_ACCESS_T &&
+ !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+
+ struct posix_acl *acl;
+
+ spin_lock(&lli->lli_lock);
+ acl = posix_acl_dup(lli->lli_posix_acl);
+ spin_unlock(&lli->lli_lock);
+
+ if (!acl)
+ return -ENODATA;
+
+ rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+ posix_acl_release(acl);
+ return rc;
+ }
+ if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode))
+ return -ENODATA;
+#endif
+
+do_getxattr:
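+ /*
+ * "system.posix_acl_access" is cached separately in lli_posix_acl
+ * (see above), so it bypasses the generic xattr cache.
+ */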
+ if (sbi->ll_xattr_cache_enabled && xattr_type != XATTR_ACL_ACCESS_T) {
+ rc = ll_xattr_cache_get(inode, name, buffer, size, valid);
+ if (rc == -EAGAIN)
+ goto getxattr_nocache;
+ if (rc < 0)
+ goto out_xattr;
+
+ /* Add "system.posix_acl_access" to the list */
+ if (lli->lli_posix_acl != NULL && valid & OBD_MD_FLXATTRLS) {
+ if (size == 0) {
+ rc += sizeof(XATTR_NAME_ACL_ACCESS);
+ } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) {
+ memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS,
+ sizeof(XATTR_NAME_ACL_ACCESS));
+ rc += sizeof(XATTR_NAME_ACL_ACCESS);
+ } else {
+ rc = -ERANGE;
+ goto out_xattr;
+ }
+ }
+ } else {
+getxattr_nocache:
+ oc = ll_mdscapa_get(inode);
+ rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+ valid | (rce ? rce_ops2valid(rce->rce_ops) : 0),
+ name, NULL, 0, size, 0, &req);
+ capa_put(oc);
+
+ if (rc < 0)
+ goto out_xattr;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body);
+
+ /* only detect the xattr size */
+ if (size == 0) {
+ rc = body->eadatasize;
+ goto out;
+ }
+
+ if (size < body->eadatasize) {
+ CERROR("server bug: replied size %u > %u\n",
+ body->eadatasize, (int)size);
+ rc = -ERANGE;
+ goto out;
+ }
+
+ if (body->eadatasize == 0) {
+ rc = -ENODATA;
+ goto out;
+ }
+
+ /* no need to swab the xattr data */
+ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
+ body->eadatasize);
+ if (!xdata) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ memcpy(buffer, xdata, body->eadatasize);
+ rc = body->eadatasize;
+ }
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (rce && rce->rce_ops == RMT_LSETFACL) {
+ ext_acl_xattr_header *acl;
+
+ acl = lustre_posix_acl_xattr_2ext(
+ (posix_acl_xattr_header *)buffer, rc);
+ if (IS_ERR(acl)) {
+ rc = PTR_ERR(acl);
+ goto out;
+ }
+
+ rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode),
+ xattr_type, acl);
+ if (unlikely(rc < 0)) {
+ lustre_ext_acl_xattr_free(acl);
+ goto out;
+ }
+ }
+#endif
+
+out_xattr:
+ if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+ LCONSOLE_INFO(
+ "%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n",
+ ll_get_fsname(inode->i_sb, NULL, 0), rc);
+ sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+ }
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+
+ LASSERT(inode);
+ LASSERT(name);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+ inode->i_ino, inode->i_generation, inode, name);
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);
+
+ if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+ sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+ strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+ (strncmp(name, XATTR_LUSTRE_PREFIX,
+ sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+ strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
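+ /*
+ * "trusted.lov"/"lustre.lov" are synthesized from the striping
+ * information (LSM) rather than fetched as a regular xattr.
+ */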
+ struct lov_stripe_md *lsm;
+ struct lov_user_md *lump;
+ struct lov_mds_md *lmm = NULL;
+ struct ptlrpc_request *request = NULL;
+ int rc = 0, lmmsize = 0;
+
+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ return -ENODATA;
+
+ if (size == 0 && S_ISDIR(inode->i_mode)) {
+ /* XXX the directory EA size is fixed for now; optimize to save
+ * the RPC transfer */
+ rc = sizeof(struct lov_user_md);
+ goto out;
+ }
+
+ lsm = ccc_inode_lsm_get(inode);
+ if (lsm == NULL) {
+ if (S_ISDIR(inode->i_mode)) {
+ rc = ll_dir_getstripe(inode, &lmm,
+ &lmmsize, &request);
+ } else {
+ rc = -ENODATA;
+ }
+ } else {
+ /* The LSM is already present after the lookup/getattr call.
+ * We will need to grab the layout lock once it is implemented. */
+ rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm);
+ lmmsize = rc;
+ }
+ ccc_inode_lsm_put(inode, lsm);
+
+ if (rc < 0)
+ goto out;
+
+ if (size == 0) {
+ /* We used to call ll_get_max_mdsize() here to get
+ * the maximum buffer size, but some apps (such as
+ * rsync 3.0.x) care about the exact xattr value
+ * size. */
+ rc = lmmsize;
+ goto out;
+ }
+
+ if (size < lmmsize) {
+ CERROR("server bug: replied size %d > %d for %pd (%s)\n",
+ lmmsize, (int)size, dentry, name);
+ rc = -ERANGE;
+ goto out;
+ }
+
+ lump = (struct lov_user_md *)buffer;
+ memcpy(lump, lmm, lmmsize);
+ /* do not return layout gen for getxattr otherwise it would
+ * confuse tar --xattr by recognizing layout gen as stripe
+ * offset when the file is restored. See LU-2809. */
+ lump->lmm_layout_gen = 0;
+
+ rc = lmmsize;
+out:
+ if (request)
+ ptlrpc_req_finished(request);
+ else if (lmm)
+ obd_free_diskmd(ll_i2dtexp(inode), &lmm);
+ return rc;
+ }
+
+ return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR);
+}
+
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ int rc = 0, rc2 = 0;
+ struct lov_mds_md *lmm = NULL;
+ struct ptlrpc_request *request = NULL;
+ int lmmsize;
+
+ LASSERT(inode);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+ inode->i_ino, inode->i_generation, inode);
+
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1);
+
+ rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS);
+ if (rc < 0)
+ goto out;
+
+ if (buffer != NULL) {
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ char *xattr_name = buffer;
+ int xlen, rem = rc;
+
+ while (rem > 0) {
+ xlen = strnlen(xattr_name, rem - 1) + 1;
+ rem -= xlen;
+ if (xattr_type_filter(sbi,
+ get_xattr_type(xattr_name)) == 0) {
+ /* skip OK xattr type
+ * leave it in buffer
+ */
+ xattr_name += xlen;
+ continue;
+ }
+ /* move up remaining xattrs in buffer
+ * removing the xattr that is not OK
+ */
+ memmove(xattr_name, xattr_name + xlen, rem);
+ rc -= xlen;
+ }
+ }
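+ /*
+ * If the object has striping information (a regular file with an
+ * LSM, or a striped directory), append the virtual "lustre.lov"
+ * name to the returned list.
+ */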
+ if (S_ISREG(inode->i_mode)) {
+ if (!ll_i2info(inode)->lli_has_smd)
+ rc2 = -1;
+ } else if (S_ISDIR(inode->i_mode)) {
+ rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+ }
+
+ if (rc2 < 0) {
+ rc2 = 0;
+ goto out;
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+ const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1;
+ const size_t name_len = sizeof("lov") - 1;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (((rc + total_len) > size) && (buffer != NULL)) {
+ ptlrpc_req_finished(request);
+ return -ERANGE;
+ }
+
+ if (buffer != NULL) {
+ buffer += rc;
+ memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len);
+ memcpy(buffer + prefix_len, "lov", name_len);
+ buffer[prefix_len + name_len] = '\0';
+ }
+ rc2 = total_len;
+ }
+out:
+ ptlrpc_req_finished(request);
+ rc = rc + rc2;
+
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/xattr_cache.c b/drivers/staging/lustre/lustre/llite/xattr_cache.c
new file mode 100644
index 000000000..69ea92adf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/llite/xattr_cache.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Author: Andrew Perepechko <Andrew_Perepechko@xyratex.com>
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include "../include/obd_support.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_ver.h"
+#include "llite_internal.h"
+
+/* If we ever have hundreds of extended attributes, we might want to consider
+ * using a hash or a tree structure instead of a list for faster lookups.
+ */
+struct ll_xattr_entry {
+ struct list_head xe_list; /* protected with
+ * lli_xattrs_list_rwsem */
+ char *xe_name; /* xattr name, \0-terminated */
+ char *xe_value; /* xattr value */
+ unsigned xe_namelen; /* strlen(xe_name) + 1 */
+ unsigned xe_vallen; /* xattr value length */
+};
+
+static struct kmem_cache *xattr_kmem;
+static struct lu_kmem_descr xattr_caches[] = {
+ {
+ .ckd_cache = &xattr_kmem,
+ .ckd_name = "xattr_kmem",
+ .ckd_size = sizeof(struct ll_xattr_entry)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+int ll_xattr_init(void)
+{
+ return lu_kmem_init(xattr_caches);
+}
+
+void ll_xattr_fini(void)
+{
+ lu_kmem_fini(xattr_caches);
+}
+
+/**
+ * Initializes xattr cache for an inode.
+ *
+ * This initializes the xattr list and marks cache presence.
+ */
+static void ll_xattr_cache_init(struct ll_inode_info *lli)
+{
+ LASSERT(lli != NULL);
+
+ INIT_LIST_HEAD(&lli->lli_xattrs);
+ lli->lli_flags |= LLIF_XATTR_CACHE;
+}
+
+/**
+ * This looks for a specific extended attribute.
+ *
+ * Find in @cache and return @xattr_name attribute in @xattr,
+ * for the NULL @xattr_name return the first cached @xattr.
+ *
+ * \retval 0 success
+ * \retval -ENODATA if not found
+ */
+static int ll_xattr_cache_find(struct list_head *cache,
+ const char *xattr_name,
+ struct ll_xattr_entry **xattr)
+{
+ struct ll_xattr_entry *entry;
+
+ list_for_each_entry(entry, cache, xe_list) {
+ /* xattr_name == NULL means look for any entry */
+ if (xattr_name == NULL ||
+ strcmp(xattr_name, entry->xe_name) == 0) {
+ *xattr = entry;
+ CDEBUG(D_CACHE, "find: [%s]=%.*s\n",
+ entry->xe_name, entry->xe_vallen,
+ entry->xe_value);
+ return 0;
+ }
+ }
+
+ return -ENODATA;
+}
+
+/**
+ * This adds an xattr.
+ *
+ * Add @xattr_name attr with @xattr_val value and @xattr_val_len length,
+ *
+ * \retval 0 success
+ * \retval -ENOMEM if no memory could be allocated for the cached attr
+ * \retval -EPROTO if duplicate xattr is being added
+ */
+static int ll_xattr_cache_add(struct list_head *cache,
+ const char *xattr_name,
+ const char *xattr_val,
+ unsigned xattr_val_len)
+{
+ struct ll_xattr_entry *xattr;
+
+ if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) {
+ CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name);
+ return -EPROTO;
+ }
+
+ OBD_SLAB_ALLOC_PTR_GFP(xattr, xattr_kmem, GFP_NOFS);
+ if (xattr == NULL) {
+ CDEBUG(D_CACHE, "failed to allocate xattr\n");
+ return -ENOMEM;
+ }
+
+ xattr->xe_name = kstrdup(xattr_name, GFP_NOFS);
+ if (!xattr->xe_name) {
+ CDEBUG(D_CACHE, "failed to alloc xattr name %u\n",
+ xattr->xe_namelen);
+ goto err_name;
+ }
+ xattr->xe_value = kmemdup(xattr_val, xattr_val_len, GFP_NOFS);
+ if (!xattr->xe_value)
+ goto err_value;
+
+ xattr->xe_vallen = xattr_val_len;
+ list_add(&xattr->xe_list, cache);
+
+ CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name,
+ xattr_val_len, xattr_val);
+
+ return 0;
+err_value:
+ OBD_FREE(xattr->xe_name, xattr->xe_namelen);
+err_name:
+ OBD_SLAB_FREE_PTR(xattr, xattr_kmem);
+
+ return -ENOMEM;
+}
+
+/**
+ * This removes an extended attribute from cache.
+ *
+ * Remove @xattr_name attribute from @cache.
+ *
+ * \retval 0 success
+ * \retval -ENODATA if @xattr_name is not cached
+ */
+static int ll_xattr_cache_del(struct list_head *cache,
+ const char *xattr_name)
+{
+ struct ll_xattr_entry *xattr;
+
+ CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name);
+
+ if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) {
+ list_del(&xattr->xe_list);
+ OBD_FREE(xattr->xe_name, xattr->xe_namelen);
+ OBD_FREE(xattr->xe_value, xattr->xe_vallen);
+ OBD_SLAB_FREE_PTR(xattr, xattr_kmem);
+
+ return 0;
+ }
+
+ return -ENODATA;
+}
+
+/**
+ * This iterates cached extended attributes.
+ *
+ * Walk over cached attributes in @cache and
+ * fill in @xld_buffer or only calculate buffer
+ * size if @xld_buffer is NULL.
+ *
+ * \retval >= 0 buffer list size
+ * \retval -ENODATA if the list cannot fit @xld_size buffer
+ */
+static int ll_xattr_cache_list(struct list_head *cache,
+ char *xld_buffer,
+ int xld_size)
+{
+ struct ll_xattr_entry *xattr, *tmp;
+ int xld_tail = 0;
+
+ list_for_each_entry_safe(xattr, tmp, cache, xe_list) {
+ CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n",
+ xld_buffer, xld_tail, xattr->xe_name);
+
+ if (xld_buffer) {
+ xld_size -= xattr->xe_namelen;
+ if (xld_size < 0)
+ break;
+ memcpy(&xld_buffer[xld_tail],
+ xattr->xe_name, xattr->xe_namelen);
+ }
+ xld_tail += xattr->xe_namelen;
+ }
+
+ if (xld_size < 0)
+ return -ERANGE;
+
+ return xld_tail;
+}
+
+/**
+ * Check if the xattr cache is initialized (filled).
+ *
+ * \retval 0 @cache is not initialized
+ * \retval 1 @cache is initialized
+ */
+static int ll_xattr_cache_valid(struct ll_inode_info *lli)
+{
+ return !!(lli->lli_flags & LLIF_XATTR_CACHE);
+}
+
+/**
+ * This finalizes the xattr cache.
+ *
+ * Free all xattr memory. @lli is the inode info pointer.
+ *
+ * \retval 0 no error occurred
+ */
+static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli)
+{
+ if (!ll_xattr_cache_valid(lli))
+ return 0;
+
+ while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0)
+ ; /* empty loop */
+ lli->lli_flags &= ~LLIF_XATTR_CACHE;
+
+ return 0;
+}
+
+int ll_xattr_cache_destroy(struct inode *inode)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int rc;
+
+ down_write(&lli->lli_xattrs_list_rwsem);
+ rc = ll_xattr_cache_destroy_locked(lli);
+ up_write(&lli->lli_xattrs_list_rwsem);
+
+ return rc;
+}
+
+/**
+ * Match or enqueue a PR lock.
+ *
+ * Find or request an LDLM lock with xattr data.
+ * Since LDLM does not provide API for atomic match_or_enqueue,
+ * the function handles it with a separate enq lock.
+ * If successful, the function exits with the list lock held.
+ *
+ * \retval 0 no error occurred
+ * \retval -ENOMEM not enough memory
+ */
+static int ll_xattr_find_get_lock(struct inode *inode,
+ struct lookup_intent *oit,
+ struct ptlrpc_request **req)
+{
+ ldlm_mode_t mode;
+ struct lustre_handle lockh = { 0 };
+ struct md_op_data *op_data;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
+ .ei_mode = it_to_lock_mode(oit),
+ .ei_cb_bl = ll_md_blocking_ast,
+ .ei_cb_cp = ldlm_completion_ast };
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct obd_export *exp = sbi->ll_md_exp;
+ int rc;
+
+ mutex_lock(&lli->lli_xattrs_enq_lock);
+ /* inode may have been shrunk and recreated, so data is gone, match lock
+ * only when data exists. */
+ if (ll_xattr_cache_valid(lli)) {
+ /* Try matching first. */
+ mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0,
+ LCK_PR);
+ if (mode != 0) {
+ /* fake oit in mdc_revalidate_lock() manner */
+ oit->d.lustre.it_lock_handle = lockh.cookie;
+ oit->d.lustre.it_lock_mode = mode;
+ goto out;
+ }
+ }
+
+ /* Enqueue if the lock isn't cached locally. */
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data)) {
+ mutex_unlock(&lli->lli_xattrs_enq_lock);
+ return PTR_ERR(op_data);
+ }
+
+ op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS;
+
+ rc = md_enqueue(exp, &einfo, oit, op_data, &lockh, NULL, 0, NULL, 0);
+ ll_finish_md_op_data(op_data);
+
+ if (rc < 0) {
+ CDEBUG(D_CACHE,
+ "md_intent_lock failed with %d for fid "DFID"\n",
+ rc, PFID(ll_inode2fid(inode)));
+ mutex_unlock(&lli->lli_xattrs_enq_lock);
+ return rc;
+ }
+
+ *req = (struct ptlrpc_request *)oit->d.lustre.it_data;
+out:
+ down_write(&lli->lli_xattrs_list_rwsem);
+ mutex_unlock(&lli->lli_xattrs_enq_lock);
+
+ return 0;
+}
+
+/**
+ * Refill the xattr cache.
+ *
+ * Fetch and cache the whole of xattrs for @inode, acquiring
+ * a read or a write xattr lock depending on operation in @oit.
+ * Intent is dropped on exit unless the operation is setxattr.
+ *
+ * \retval 0 no error occurred
+ * \retval -EPROTO network protocol error
+ * \retval -ENOMEM not enough memory for the cache
+ */
+static int ll_xattr_cache_refill(struct inode *inode, struct lookup_intent *oit)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *req = NULL;
+ const char *xdata, *xval, *xtail, *xvtail;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct mdt_body *body;
+ __u32 *xsizes;
+ int rc = 0, i;
+
+ rc = ll_xattr_find_get_lock(inode, oit, &req);
+ if (rc)
+ goto out_no_unlock;
+
+ /* Do we have the data at this point? */
+ if (ll_xattr_cache_valid(lli)) {
+ ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1);
+ rc = 0;
+ goto out_maybe_drop;
+ }
+
+ /* Matched but no cache? Cancelled on error by a parallel refill. */
+ if (unlikely(req == NULL)) {
+ CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n");
+ rc = -EIO;
+ goto out_maybe_drop;
+ }
+
+ if (oit->d.lustre.it_status < 0) {
+ CDEBUG(D_CACHE, "getxattr intent returned %d for fid "DFID"\n",
+ oit->d.lustre.it_status, PFID(ll_inode2fid(inode)));
+ rc = oit->d.lustre.it_status;
+ /* xattr data is so large that we don't want to cache it */
+ if (rc == -ERANGE)
+ rc = -EAGAIN;
+ goto out_destroy;
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL) {
+ CERROR("no MDT BODY in the refill xattr reply\n");
+ rc = -EPROTO;
+ goto out_destroy;
+ }
+ /* no need to swab the xattr data */
+ xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
+ body->eadatasize);
+ xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS,
+ body->aclsize);
+ xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS,
+ body->max_mdsize * sizeof(__u32));
+ if (xdata == NULL || xval == NULL || xsizes == NULL) {
+ CERROR("wrong setxattr reply\n");
+ rc = -EPROTO;
+ goto out_destroy;
+ }
+
+ xtail = xdata + body->eadatasize;
+ xvtail = xval + body->aclsize;
+
+ CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail);
+
+ ll_xattr_cache_init(lli);
+
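+ /*
+ * The reply carries three parallel arrays: \0-separated names in
+ * RMF_EADATA, values in RMF_EAVALS and value lengths in
+ * RMF_EAVALS_LENS; walk them in lockstep.
+ */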
+ for (i = 0; i < body->max_mdsize; i++) {
+ CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval);
+ /* Perform consistency checks: attr names and vals in pill */
+ if (memchr(xdata, 0, xtail - xdata) == NULL) {
+ CERROR("xattr protocol violation (names are broken)\n");
+ rc = -EPROTO;
+ } else if (xval + *xsizes > xvtail) {
+ CERROR("xattr protocol violation (vals are broken)\n");
+ rc = -EPROTO;
+ } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) {
+ rc = -ENOMEM;
+ } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) {
+ /* Filter out ACL ACCESS since it's cached separately */
+ CDEBUG(D_CACHE, "not caching %s\n",
+ XATTR_NAME_ACL_ACCESS);
+ rc = 0;
+ } else {
+ rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval,
+ *xsizes);
+ }
+ if (rc < 0) {
+ ll_xattr_cache_destroy_locked(lli);
+ goto out_destroy;
+ }
+ xdata += strlen(xdata) + 1;
+ xval += *xsizes;
+ xsizes++;
+ }
+
+ if (xdata != xtail || xval != xvtail)
+ CERROR("a hole in xattr data\n");
+
+ ll_set_lock_data(sbi->ll_md_exp, inode, oit, NULL);
+
+ goto out_maybe_drop;
+out_maybe_drop:
+
+ ll_intent_drop_lock(oit);
+
+ if (rc != 0)
+ up_write(&lli->lli_xattrs_list_rwsem);
+out_no_unlock:
+ ptlrpc_req_finished(req);
+
+ return rc;
+
+out_destroy:
+ up_write(&lli->lli_xattrs_list_rwsem);
+
+ ldlm_lock_decref_and_cancel((struct lustre_handle *)
+ &oit->d.lustre.it_lock_handle,
+ oit->d.lustre.it_lock_mode);
+
+ goto out_no_unlock;
+}
+
+/**
+ * Get an xattr value or list xattrs using the write-through cache.
+ *
+ * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or
+ * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode.
+ * The resulting value/list is stored in @buffer if the former
+ * is not larger than @size.
+ *
+ * \retval 0 no error occurred
+ * \retval -EPROTO network protocol error
+ * \retval -ENOMEM not enough memory for the cache
+ * \retval -ERANGE the buffer is not large enough
+ * \retval -ENODATA no such attr or the list is empty
+ */
+int ll_xattr_cache_get(struct inode *inode,
+ const char *name,
+ char *buffer,
+ size_t size,
+ __u64 valid)
+{
+ struct lookup_intent oit = { .it_op = IT_GETXATTR };
+ struct ll_inode_info *lli = ll_i2info(inode);
+ int rc = 0;
+
+ LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS));
+
+ down_read(&lli->lli_xattrs_list_rwsem);
+ if (!ll_xattr_cache_valid(lli)) {
+ up_read(&lli->lli_xattrs_list_rwsem);
+ rc = ll_xattr_cache_refill(inode, &oit);
+ if (rc)
+ return rc;
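+ /*
+ * A successful refill returns with lli_xattrs_list_rwsem held
+ * for write; downgrade it to read for the lookup below.
+ */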
+ downgrade_write(&lli->lli_xattrs_list_rwsem);
+ } else {
+ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1);
+ }
+
+ if (valid & OBD_MD_FLXATTR) {
+ struct ll_xattr_entry *xattr;
+
+ rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr);
+ if (rc == 0) {
+ rc = xattr->xe_vallen;
+ /* zero size means only the value size is requested and returned in rc */
+ if (size != 0) {
+ if (size >= xattr->xe_vallen)
+ memcpy(buffer, xattr->xe_value,
+ xattr->xe_vallen);
+ else
+ rc = -ERANGE;
+ }
+ }
+ } else if (valid & OBD_MD_FLXATTRLS) {
+ rc = ll_xattr_cache_list(&lli->lli_xattrs,
+ size ? buffer : NULL, size);
+ }
+
+ goto out;
+out:
+ up_read(&lli->lli_xattrs_list_rwsem);
+
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile
new file mode 100644
index 000000000..a7a15369a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LUSTRE_FS) += lmv.o
+lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o
+lmv-$(CONFIG_PROC_FS) += lproc_lmv.o
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
new file mode 100644
index 000000000..ee235926f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
@@ -0,0 +1,83 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+
+#include "../include/obd_support.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_fid.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_dlm.h"
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "lmv_internal.h"
+
+int lmv_fld_lookup(struct lmv_obd *lmv,
+ const struct lu_fid *fid,
+ u32 *mds)
+{
+ int rc;
+
+ /* FIXME: Currently ZFS still uses a local seq for ROOT unfortunately, and
+ * this fid_is_local check should be removed once LU-2240 is fixed */
+ LASSERTF((fid_seq_in_fldb(fid_seq(fid)) ||
+ fid_seq_is_local_file(fid_seq(fid))) &&
+ fid_is_sane(fid), DFID" is insane!\n", PFID(fid));
+
+ rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds,
+ LU_SEQ_RANGE_MDT, NULL);
+ if (rc) {
+ CERROR("Error while looking for mds number. Seq %#llx, err = %d\n",
+ fid_seq(fid), rc);
+ return rc;
+ }
+
+ CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+ *mds, PFID(fid));
+
+ if (*mds >= lmv->desc.ld_tgt_count) {
+ CERROR("FLD lookup got invalid mds #%x (max: %x) for fid=" DFID "\n", *mds, lmv->desc.ld_tgt_count,
+ PFID(fid));
+ rc = -EINVAL;
+ }
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
new file mode 100644
index 000000000..d22d57b4f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
@@ -0,0 +1,323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include "../include/lustre_intent.h"
+#include "../include/obd_support.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_dlm.h"
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "lmv_internal.h"
+
+static int lmv_intent_remote(struct obd_export *exp, void *lmm,
+ int lmmsize, struct lookup_intent *it,
+ const struct lu_fid *parent_fid, int flags,
+ struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct ptlrpc_request *req = NULL;
+ struct lustre_handle plock;
+ struct md_op_data *op_data;
+ struct lmv_tgt_desc *tgt;
+ struct mdt_body *body;
+ int pmode;
+ int rc = 0;
+
+ body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ return -EPROTO;
+
+ LASSERT((body->valid & OBD_MD_MDS));
+
+ /*
+ * Unfortunately, we have to lie to MDC/MDS to retrieve
+ * attributes llite needs and provide proper locking.
+ */
+ if (it->it_op & IT_LOOKUP)
+ it->it_op = IT_GETATTR;
+
+ /*
+ * We got LOOKUP lock, but we really need attrs.
+ */
+ pmode = it->d.lustre.it_lock_mode;
+ if (pmode) {
+ plock.cookie = it->d.lustre.it_lock_handle;
+ it->d.lustre.it_lock_mode = 0;
+ it->d.lustre.it_data = NULL;
+ }
+
+ LASSERT(fid_is_sane(&body->fid1));
+
+ tgt = lmv_find_target(lmv, &body->fid1);
+ if (IS_ERR(tgt)) {
+ rc = PTR_ERR(tgt);
+ goto out;
+ }
+
+ OBD_ALLOC_PTR(op_data);
+ if (op_data == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ op_data->op_fid1 = body->fid1;
+ /* Send the parent FID to the remote MDT */
+ if (parent_fid != NULL) {
+ /* The parent fid is only for remote open to
+ * check whether the open is from OBF,
+ * see mdt_cross_open */
+ LASSERT(it->it_op & IT_OPEN);
+ op_data->op_fid2 = *parent_fid;
+ /* Add object FID to op_fid3, in case it needs to check stale
+ * (M_CHECK_STALE), see mdc_finish_intent_lock */
+ op_data->op_fid3 = body->fid1;
+ }
+
+ op_data->op_bias = MDS_CROSS_REF;
+ CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n",
+ PFID(&body->fid1), tgt->ltd_idx);
+
+ rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+ flags, &req, cb_blocking, extra_lock_flags);
+ if (rc)
+ goto out_free_op_data;
+
+ /*
+ * LLite needs LOOKUP lock to track dentry revocation in order to
+ * maintain dcache consistency. Thus drop UPDATE|PERM lock here
+ * and put LOOKUP in request.
+ */
+ if (it->d.lustre.it_lock_mode != 0) {
+ it->d.lustre.it_remote_lock_handle =
+ it->d.lustre.it_lock_handle;
+ it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode;
+ }
+
+ it->d.lustre.it_lock_handle = plock.cookie;
+ it->d.lustre.it_lock_mode = pmode;
+
+out_free_op_data:
+ OBD_FREE_PTR(op_data);
+out:
+ if (rc && pmode)
+ ldlm_lock_decref(&plock, pmode);
+
+ ptlrpc_req_finished(*reqp);
+ *reqp = req;
+ return rc;
+}
+
+/*
+ * IT_OPEN is intended to open (and possibly create) an object. The parent
+ * (pid) may be a split dir.
+ */
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ struct mdt_body *body;
+ int rc;
+
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ /* If it is ready to open the file by FID, do not need
+ * allocate FID at all, otherwise it will confuse MDT */
+ if ((it->it_op & IT_CREAT) &&
+ !(it->it_flags & MDS_OPEN_BY_FID)) {
+ /*
+ * For an open with IT_CREAT (and not open-by-FID), allocate a new
+ * fid and set up the FLD for it.
+ */
+ op_data->op_fid3 = op_data->op_fid2;
+ rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+ if (rc != 0)
+ return rc;
+ }
+
+ CDEBUG(D_INODE, "OPEN_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%d\n",
+ PFID(&op_data->op_fid1),
+ PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+
+ rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags,
+ reqp, cb_blocking, extra_lock_flags);
+ if (rc != 0)
+ return rc;
+ /*
+ * Nothing is found, do not access body->fid1 as it is zero and thus
+ * pointless.
+ */
+ if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) &&
+ !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) &&
+ !(it->d.lustre.it_disposition & DISP_OPEN_OPEN))
+ return rc;
+
+ body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ return -EPROTO;
+ /*
+ * Not cross-ref case, just get out of here.
+ */
+ if (likely(!(body->valid & OBD_MD_MDS)))
+ return 0;
+
+ /*
+ * Okay, MDS has returned success. Probably name has been resolved in
+ * remote inode.
+ */
+ rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags,
+ reqp, cb_blocking, extra_lock_flags);
+ if (rc != 0) {
+ LASSERT(rc < 0);
+ /*
+ * It is possible that some userspace application will try to
+ * open a file as a directory and we will get -ENOTDIR here. As
+ * this is a normal situation, do not print an error here,
+ * only debug info.
+ */
+ CDEBUG(D_INODE, "Can't handle remote %s: dir " DFID "(" DFID "):%*s: %d\n",
+ LL_IT2STR(it), PFID(&op_data->op_fid2),
+ PFID(&op_data->op_fid1), op_data->op_namelen,
+ op_data->op_name, rc);
+ return rc;
+ }
+
+ return rc;
+}
+
+/*
+ * Handler for: getattr, lookup and revalidate cases.
+ */
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt = NULL;
+ struct mdt_body *body;
+ int rc = 0;
+
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ if (!fid_is_sane(&op_data->op_fid2))
+ fid_zero(&op_data->op_fid2);
+
+ CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID
+ ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+ PFID(&op_data->op_fid2),
+ op_data->op_name ? op_data->op_name : "<NULL>",
+ tgt->ltd_idx);
+
+ op_data->op_bias &= ~MDS_CROSS_REF;
+
+ rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+ flags, reqp, cb_blocking, extra_lock_flags);
+
+ if (rc < 0 || *reqp == NULL)
+ return rc;
+
+ /*
+ * MDS has returned success. Probably name has been resolved in
+ * remote inode. Let's check this.
+ */
+ body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ return -EPROTO;
+ /* Not cross-ref case, just get out of here. */
+ if (likely(!(body->valid & OBD_MD_MDS)))
+ return 0;
+
+ rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp,
+ cb_blocking, extra_lock_flags);
+
+ return rc;
+}
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ struct obd_device *obd = exp->exp_obd;
+ int rc;
+
+ LASSERT(it != NULL);
+ LASSERT(fid_is_sane(&op_data->op_fid1));
+
+ CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n",
+ LL_IT2STR(it), op_data->op_namelen, op_data->op_name,
+ PFID(&op_data->op_fid1));
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT))
+ rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it,
+ flags, reqp, cb_blocking,
+ extra_lock_flags);
+ else if (it->it_op & IT_OPEN)
+ rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it,
+ flags, reqp, cb_blocking,
+ extra_lock_flags);
+ else
+ LBUG();
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
new file mode 100644
index 000000000..852d78721
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
@@ -0,0 +1,157 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LMV_INTERNAL_H_
+#define _LMV_INTERNAL_H_
+
+#include "../include/lustre/lustre_idl.h"
+#include "../include/obd.h"
+
+#define LMV_MAX_TGT_COUNT 128
+
+#define lmv_init_lock(lmv) mutex_lock(&lmv->init_mutex)
+#define lmv_init_unlock(lmv) mutex_unlock(&lmv->init_mutex)
+
+#define LL_IT2STR(it) \
+ ((it) ? ldlm_it2str((it)->it_op) : "0")
+
+int lmv_check_connect(struct obd_device *obd);
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags);
+
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags);
+
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags);
+
+int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+ void *, int);
+int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds);
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds);
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+ struct md_op_data *op_data);
+
+static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req)
+{
+ struct mdt_body *body;
+ struct lmv_stripe_md *mea;
+
+ LASSERT(req != NULL);
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+ if (!body || !S_ISDIR(body->mode) || !body->eadatasize)
+ return NULL;
+
+ mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD,
+ body->eadatasize);
+ LASSERT(mea != NULL);
+
+ if (mea->mea_count == 0)
+ return NULL;
+ if (mea->mea_magic != MEA_MAGIC_LAST_CHAR &&
+ mea->mea_magic != MEA_MAGIC_ALL_CHARS &&
+ mea->mea_magic != MEA_MAGIC_HASH_SEGMENT)
+ return NULL;
+
+ return mea;
+}
+
+static inline int lmv_get_easize(struct lmv_obd *lmv)
+{
+ return sizeof(struct lmv_stripe_md) +
+ lmv->desc.ld_tgt_count *
+ sizeof(struct lu_fid);
+}
+
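+/*
+ * Return the target descriptor whose ltd_idx matches @mds. Unconfigured
+ * slots in lmv->tgts[] are skipped; ERR_PTR(-ENODEV) is returned when no
+ * matching target exists.
+ */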
+static inline struct lmv_tgt_desc *
+lmv_get_target(struct lmv_obd *lmv, u32 mds)
+{
+ int count = lmv->desc.ld_tgt_count;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ if (lmv->tgts[i] == NULL)
+ continue;
+
+ if (lmv->tgts[i]->ltd_idx == mds)
+ return lmv->tgts[i];
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
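+/*
+ * Locate the target serving @fid. With more than one configured target the
+ * MDT index is resolved through the FLD client first; with a single
+ * configured target, MDS index 0 is used.
+ */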
+static inline struct lmv_tgt_desc *
+lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
+{
+ u32 mds = 0;
+ int rc;
+
+ if (lmv->desc.ld_tgt_count > 1) {
+ rc = lmv_fld_lookup(lmv, fid, &mds);
+ if (rc)
+ return ERR_PTR(rc);
+ }
+
+ return lmv_get_target(lmv, mds);
+}
+
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+ struct lu_fid *fid);
+/* lproc_lmv.c */
+#if defined(CONFIG_PROC_FS)
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+extern struct file_operations lmv_proc_target_fops;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
new file mode 100644
index 000000000..b9459faf8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
@@ -0,0 +1,2892 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/uaccess.h>
+
+#include "../include/lustre/lustre_idl.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_net.h"
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_lite.h"
+#include "../include/lustre_fid.h"
+#include "lmv_internal.h"
+
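+/*
+ * Flip the active state of @tgt and keep desc.ld_active_tgt_count in sync;
+ * calls with an unchanged state are a no-op.
+ */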
+static void lmv_activate_target(struct lmv_obd *lmv,
+ struct lmv_tgt_desc *tgt,
+ int activate)
+{
+ if (tgt->ltd_active == activate)
+ return;
+
+ tgt->ltd_active = activate;
+ lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
+}
+
+/**
+ * Error codes:
+ *
+ * -EINVAL : UUID can't be found in the LMV's target list
+ * -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ * -EBADF : The UUID is found, but the OBD is of the wrong type (!)
+ */
+static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
+ int activate)
+{
+ struct lmv_tgt_desc *uninitialized_var(tgt);
+ struct obd_device *obd;
+ int i;
+ int rc = 0;
+
+ CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
+ lmv, uuid->uuid, activate);
+
+ spin_lock(&lmv->lmv_lock);
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (tgt == NULL || tgt->ltd_exp == NULL)
+ continue;
+
+ CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i,
+ tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
+
+ if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+ break;
+ }
+
+ if (i == lmv->desc.ld_tgt_count) {
+ rc = -EINVAL;
+ goto out_lmv_lock;
+ }
+
+ obd = class_exp2obd(tgt->ltd_exp);
+ if (obd == NULL) {
+ rc = -ENOTCONN;
+ goto out_lmv_lock;
+ }
+
+ CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
+ obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
+ obd->obd_type->typ_name, i);
+ LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
+
+ if (tgt->ltd_active == activate) {
+ CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+ activate ? "" : "in");
+ goto out_lmv_lock;
+ }
+
+ CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
+ activate ? "" : "in");
+ lmv_activate_target(lmv, tgt, activate);
+
+ out_lmv_lock:
+ spin_unlock(&lmv->lmv_lock);
+ return rc;
+}
+
+static struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
+{
+ struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+ return obd_get_uuid(lmv->tgts[0]->ltd_exp);
+}
+
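+/*
+ * Notification callback for events coming from the underlying MDC devices:
+ * activate or deactivate the matching target, or, for OBD_NOTIFY_OCD, copy
+ * the connect data into the LMV self export. The event is then passed on to
+ * the observer, if any.
+ */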
+static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
+ enum obd_notify_event ev, void *data)
+{
+ struct obd_connect_data *conn_data;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct obd_uuid *uuid;
+ int rc = 0;
+
+ if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+ CERROR("unexpected notification of %s %s!\n",
+ watched->obd_type->typ_name,
+ watched->obd_name);
+ return -EINVAL;
+ }
+
+ uuid = &watched->u.cli.cl_target_uuid;
+ if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+ /*
+ * Set MDC as active before notifying the observer, so the
+ * observer can use the MDC normally.
+ */
+ rc = lmv_set_mdc_active(lmv, uuid,
+ ev == OBD_NOTIFY_ACTIVE);
+ if (rc) {
+ CERROR("%sactivation of %s failed: %d\n",
+ ev == OBD_NOTIFY_ACTIVE ? "" : "de",
+ uuid->uuid, rc);
+ return rc;
+ }
+ } else if (ev == OBD_NOTIFY_OCD) {
+ conn_data = &watched->u.cli.cl_import->imp_connect_data;
+ /*
+ * XXX: Make sure that ocd_connect_flags from all targets are
+ * the same. Otherwise one of the MDTs is running a wrong version or
+ * something like that. --umka
+ */
+ obd->obd_self_export->exp_connect_data = *conn_data;
+ }
+#if 0
+ else if (ev == OBD_NOTIFY_DISCON) {
+ /*
+ * For disconnect event, flush fld cache for failout MDS case.
+ */
+ fld_client_flush(&lmv->lmv_fld);
+ }
+#endif
+ /*
+ * Pass the notification up the chain.
+ */
+ if (obd->obd_observer)
+ rc = obd_notify(obd->obd_observer, watched, ev, data);
+
+ return rc;
+}
+
+/**
+ * This is a fake connect function. Its purpose is to initialize the LMV and
+ * tell the caller that everything is okay. The real connection will be
+ * performed later.
+ */
+static int lmv_connect(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *obd,
+ struct obd_uuid *cluuid, struct obd_connect_data *data,
+ void *localdata)
+{
+ struct proc_dir_entry *lmv_proc_dir;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lustre_handle conn = { 0 };
+ int rc = 0;
+
+ /*
+ * We don't want to actually do the underlying connections more than
+ * once, so keep track.
+ */
+ lmv->refcount++;
+ if (lmv->refcount > 1) {
+ *exp = NULL;
+ return 0;
+ }
+
+ rc = class_connect(&conn, obd, cluuid);
+ if (rc) {
+ CERROR("class_connection() returned %d\n", rc);
+ return rc;
+ }
+
+ *exp = class_conn2export(&conn);
+ class_export_get(*exp);
+
+ lmv->exp = *exp;
+ lmv->connected = 0;
+ lmv->cluuid = *cluuid;
+
+ if (data)
+ lmv->conn_data = *data;
+
+ if (obd->obd_proc_private != NULL) {
+ lmv_proc_dir = obd->obd_proc_private;
+ } else {
+ lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry,
+ NULL, NULL);
+ if (IS_ERR(lmv_proc_dir)) {
+ CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.",
+ obd->obd_type->typ_name, obd->obd_name);
+ lmv_proc_dir = NULL;
+ }
+ obd->obd_proc_private = lmv_proc_dir;
+ }
+
+ /*
+ * All real clients should perform the actual connection right away, because
+ * it is possible that the LMV will not get a chance to connect its targets
+ * and the MDC code will be called directly, for instance while reading the
+ * ../mdc/../kbytesfree procfs file, etc.
+ */
+ if (data->ocd_connect_flags & OBD_CONNECT_REAL)
+ rc = lmv_check_connect(obd);
+
+ if (rc && lmv_proc_dir) {
+ lprocfs_remove(&lmv_proc_dir);
+ obd->obd_proc_private = NULL;
+ }
+
+ return rc;
+}
+
+static void lmv_set_timeouts(struct obd_device *obd)
+{
+ struct lmv_tgt_desc *tgt;
+ struct lmv_obd *lmv;
+ int i;
+
+ lmv = &obd->u.lmv;
+ if (lmv->server_timeout == 0)
+ return;
+
+ if (lmv->connected == 0)
+ return;
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
+ continue;
+
+ obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
+ KEY_INTERMDS, 0, NULL, NULL);
+ }
+}
+
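+/*
+ * Remember the largest EA and cookie sizes seen so far and, if any of them
+ * grew while the LMV is connected, propagate the new sizes to every active
+ * target through md_init_ea_size().
+ */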
+static int lmv_init_ea_size(struct obd_export *exp, int easize,
+ int def_easize, int cookiesize, int def_cookiesize)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int i;
+ int rc = 0;
+ int change = 0;
+
+ if (lmv->max_easize < easize) {
+ lmv->max_easize = easize;
+ change = 1;
+ }
+ if (lmv->max_def_easize < def_easize) {
+ lmv->max_def_easize = def_easize;
+ change = 1;
+ }
+ if (lmv->max_cookiesize < cookiesize) {
+ lmv->max_cookiesize = cookiesize;
+ change = 1;
+ }
+ if (lmv->max_def_cookiesize < def_cookiesize) {
+ lmv->max_def_cookiesize = def_cookiesize;
+ change = 1;
+ }
+ if (change == 0)
+ return 0;
+
+ if (lmv->connected == 0)
+ return 0;
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL ||
+ lmv->tgts[i]->ltd_exp == NULL ||
+ lmv->tgts[i]->ltd_active == 0) {
+ CWARN("%s: NULL export for %d\n", obd->obd_name, i);
+ continue;
+ }
+
+ rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize,
+ cookiesize, def_cookiesize);
+ if (rc) {
+ CERROR("%s: obd_init_ea_size() failed on MDT target %d: rc = %d.\n",
+ obd->obd_name, i, rc);
+ break;
+ }
+ }
+ return rc;
+}
+
+#define MAX_STRING_SIZE 128
+
+static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+ struct proc_dir_entry *lmv_proc_dir;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct obd_uuid *cluuid = &lmv->cluuid;
+ struct obd_uuid lmv_mdc_uuid = { "LMV_MDC_UUID" };
+ struct obd_device *mdc_obd;
+ struct obd_export *mdc_exp;
+ struct lu_fld_target target;
+ int rc;
+
+ mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+ &obd->obd_uuid);
+ if (!mdc_obd) {
+ CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+ return -EINVAL;
+ }
+
+ CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
+ mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+ tgt->ltd_uuid.uuid, obd->obd_uuid.uuid,
+ cluuid->uuid);
+
+ if (!mdc_obd->obd_set_up) {
+ CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+ return -EINVAL;
+ }
+
+ rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
+ &lmv->conn_data, NULL);
+ if (rc) {
+ CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
+ return rc;
+ }
+
+ /*
+ * Initialize the FID sequence client for this MDC and add a new FLD target.
+ */
+ rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
+ if (rc)
+ return rc;
+
+ target.ft_srv = NULL;
+ target.ft_exp = mdc_exp;
+ target.ft_idx = tgt->ltd_idx;
+
+ fld_client_add_target(&lmv->lmv_fld, &target);
+
+ rc = obd_register_observer(mdc_obd, obd);
+ if (rc) {
+ obd_disconnect(mdc_exp);
+ CERROR("target %s register_observer error %d\n",
+ tgt->ltd_uuid.uuid, rc);
+ return rc;
+ }
+
+ if (obd->obd_observer) {
+ /*
+ * Tell the observer about the new target.
+ */
+ rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
+ OBD_NOTIFY_ACTIVE,
+ (void *)(tgt - lmv->tgts[0]));
+ if (rc) {
+ obd_disconnect(mdc_exp);
+ return rc;
+ }
+ }
+
+ tgt->ltd_active = 1;
+ tgt->ltd_exp = mdc_exp;
+ lmv->desc.ld_active_tgt_count++;
+
+ md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize,
+ lmv->max_cookiesize, lmv->max_def_cookiesize);
+
+ CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
+ mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+ atomic_read(&obd->obd_refcount));
+
+ lmv_proc_dir = obd->obd_proc_private;
+ if (lmv_proc_dir) {
+ struct proc_dir_entry *mdc_symlink;
+
+ LASSERT(mdc_obd->obd_type != NULL);
+ LASSERT(mdc_obd->obd_type->typ_name != NULL);
+ mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name,
+ lmv_proc_dir,
+ "../../../%s/%s",
+ mdc_obd->obd_type->typ_name,
+ mdc_obd->obd_name);
+ if (mdc_symlink == NULL) {
+ CERROR("Could not register LMV target /proc/fs/lustre/%s/%s/target_obds/%s.",
+ obd->obd_type->typ_name, obd->obd_name,
+ mdc_obd->obd_name);
+ lprocfs_remove(&lmv_proc_dir);
+ obd->obd_proc_private = NULL;
+ }
+ }
+ return 0;
+}
+
+static void lmv_del_target(struct lmv_obd *lmv, int index)
+{
+ if (lmv->tgts[index] == NULL)
+ return;
+
+ OBD_FREE_PTR(lmv->tgts[index]);
+ lmv->tgts[index] = NULL;
+ return;
+}
+
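+/*
+ * Add a new target at @index. The lmv->tgts[] array is grown to the next
+ * power of two when the index does not fit; if the LMV is already connected,
+ * the new target is connected immediately and the default EA size is
+ * updated.
+ */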
+static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+ __u32 index, int gen)
+{
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc = 0;
+
+ CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
+
+ lmv_init_lock(lmv);
+
+ if (lmv->desc.ld_tgt_count == 0) {
+ struct obd_device *mdc_obd;
+
+ mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
+ &obd->obd_uuid);
+ if (!mdc_obd) {
+ lmv_init_unlock(lmv);
+ CERROR("%s: Target %s not attached: rc = %d\n",
+ obd->obd_name, uuidp->uuid, -EINVAL);
+ return -EINVAL;
+ }
+ }
+
+ if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
+ tgt = lmv->tgts[index];
+ CERROR("%s: UUID %s already assigned at LOV target index %d: rc = %d\n",
+ obd->obd_name,
+ obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
+ lmv_init_unlock(lmv);
+ return -EEXIST;
+ }
+
+ if (index >= lmv->tgts_size) {
+ /* We need to reallocate the lmv target array. */
+ struct lmv_tgt_desc **newtgts, **old = NULL;
+ __u32 newsize = 1;
+ __u32 oldsize = 0;
+
+ while (newsize < index + 1)
+ newsize <<= 1;
+ OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+ if (newtgts == NULL) {
+ lmv_init_unlock(lmv);
+ return -ENOMEM;
+ }
+
+ if (lmv->tgts_size) {
+ memcpy(newtgts, lmv->tgts,
+ sizeof(*newtgts) * lmv->tgts_size);
+ old = lmv->tgts;
+ oldsize = lmv->tgts_size;
+ }
+
+ lmv->tgts = newtgts;
+ lmv->tgts_size = newsize;
+ smp_rmb();
+ if (old)
+ OBD_FREE(old, sizeof(*old) * oldsize);
+
+ CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
+ lmv->tgts_size);
+ }
+
+ OBD_ALLOC_PTR(tgt);
+ if (!tgt) {
+ lmv_init_unlock(lmv);
+ return -ENOMEM;
+ }
+
+ mutex_init(&tgt->ltd_fid_mutex);
+ tgt->ltd_idx = index;
+ tgt->ltd_uuid = *uuidp;
+ tgt->ltd_active = 0;
+ lmv->tgts[index] = tgt;
+ if (index >= lmv->desc.ld_tgt_count)
+ lmv->desc.ld_tgt_count = index + 1;
+
+ if (lmv->connected) {
+ rc = lmv_connect_mdc(obd, tgt);
+ if (rc) {
+ spin_lock(&lmv->lmv_lock);
+ lmv->desc.ld_tgt_count--;
+ memset(tgt, 0, sizeof(*tgt));
+ spin_unlock(&lmv->lmv_lock);
+ } else {
+ int easize = sizeof(struct lmv_stripe_md) +
+ lmv->desc.ld_tgt_count * sizeof(struct lu_fid);
+ lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0);
+ }
+ }
+
+ lmv_init_unlock(lmv);
+ return rc;
+}
+
+int lmv_check_connect(struct obd_device *obd)
+{
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int i;
+ int rc;
+ int easize;
+
+ if (lmv->connected)
+ return 0;
+
+ lmv_init_lock(lmv);
+ if (lmv->connected) {
+ lmv_init_unlock(lmv);
+ return 0;
+ }
+
+ if (lmv->desc.ld_tgt_count == 0) {
+ lmv_init_unlock(lmv);
+ CERROR("%s: no targets configured.\n", obd->obd_name);
+ return -EINVAL;
+ }
+
+ CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
+ lmv->cluuid.uuid, obd->obd_name);
+
+ LASSERT(lmv->tgts != NULL);
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (tgt == NULL)
+ continue;
+ rc = lmv_connect_mdc(obd, tgt);
+ if (rc)
+ goto out_disc;
+ }
+
+ lmv_set_timeouts(obd);
+ class_export_put(lmv->exp);
+ lmv->connected = 1;
+ easize = lmv_get_easize(lmv);
+ lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0);
+ lmv_init_unlock(lmv);
+ return 0;
+
+ out_disc:
+ while (i-- > 0) {
+ int rc2;
+ tgt = lmv->tgts[i];
+ if (tgt == NULL)
+ continue;
+ tgt->ltd_active = 0;
+ if (tgt->ltd_exp) {
+ --lmv->desc.ld_active_tgt_count;
+ rc2 = obd_disconnect(tgt->ltd_exp);
+ if (rc2) {
+ CERROR("LMV target %s disconnect on MDC idx %d: error %d\n",
+ tgt->ltd_uuid.uuid, i, rc2);
+ }
+ }
+ }
+ class_disconnect(lmv->exp);
+ lmv_init_unlock(lmv);
+ return rc;
+}
+
+static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+ struct proc_dir_entry *lmv_proc_dir;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct obd_device *mdc_obd;
+ int rc;
+
+ LASSERT(tgt != NULL);
+ LASSERT(obd != NULL);
+
+ mdc_obd = class_exp2obd(tgt->ltd_exp);
+
+ if (mdc_obd) {
+ mdc_obd->obd_force = obd->obd_force;
+ mdc_obd->obd_fail = obd->obd_fail;
+ mdc_obd->obd_no_recov = obd->obd_no_recov;
+ }
+
+ lmv_proc_dir = obd->obd_proc_private;
+ if (lmv_proc_dir)
+ lprocfs_remove_proc_entry(mdc_obd->obd_name, lmv_proc_dir);
+
+ rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
+ if (rc)
+ CERROR("Can't finalize fids factory\n");
+
+ CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
+ tgt->ltd_exp->exp_obd->obd_name,
+ tgt->ltd_exp->exp_obd->obd_uuid.uuid);
+
+ obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
+ rc = obd_disconnect(tgt->ltd_exp);
+ if (rc) {
+ if (tgt->ltd_active) {
+ CERROR("Target %s disconnect error %d\n",
+ tgt->ltd_uuid.uuid, rc);
+ }
+ }
+
+ lmv_activate_target(lmv, tgt, 0);
+ tgt->ltd_exp = NULL;
+ return 0;
+}
+
+static int lmv_disconnect(struct obd_export *exp)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int rc;
+ int i;
+
+ if (!lmv->tgts)
+ goto out_local;
+
+ /*
+ * Only disconnect the underlying layers on the final disconnect.
+ */
+ lmv->refcount--;
+ if (lmv->refcount != 0)
+ goto out_local;
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+ continue;
+
+ lmv_disconnect_mdc(obd, lmv->tgts[i]);
+ }
+
+ if (obd->obd_proc_private)
+ lprocfs_remove((struct proc_dir_entry **)&obd->obd_proc_private);
+ else
+ CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n",
+ obd->obd_type->typ_name, obd->obd_name);
+
+out_local:
+ /*
+ * This is the case when no real connection is established by
+ * lmv_check_connect().
+ */
+ if (!lmv->connected)
+ class_export_put(exp);
+ rc = class_disconnect(exp);
+ if (lmv->refcount == 0)
+ lmv->connected = 0;
+ return rc;
+}
+
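+/*
+ * Resolve a FID to a path name. When a target answers -EREMOTE, path
+ * building continues on the MDT serving the FID returned in @gf, and each
+ * partial path segment is prepended to the path collected so far.
+ */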
+static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg)
+{
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obddev->u.lmv;
+ struct getinfo_fid2path *gf;
+ struct lmv_tgt_desc *tgt;
+ struct getinfo_fid2path *remote_gf = NULL;
+ int remote_gf_size = 0;
+ int rc;
+
+ gf = (struct getinfo_fid2path *)karg;
+ tgt = lmv_find_target(lmv, &gf->gf_fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+repeat_fid2path:
+ rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
+ if (rc != 0 && rc != -EREMOTE)
+ goto out_fid2path;
+
+ /* If remote_gf != NULL, the last call built a path segment on a
+ * remote MDT; prepend that segment to the original gf path. */
+ if (remote_gf != NULL) {
+ struct getinfo_fid2path *ori_gf;
+ char *ptr;
+
+ ori_gf = (struct getinfo_fid2path *)karg;
+ if (strlen(ori_gf->gf_path) +
+ strlen(gf->gf_path) > ori_gf->gf_pathlen) {
+ rc = -EOVERFLOW;
+ goto out_fid2path;
+ }
+
+ ptr = ori_gf->gf_path;
+
+ memmove(ptr + strlen(gf->gf_path) + 1, ptr,
+ strlen(ori_gf->gf_path));
+
+ strncpy(ptr, gf->gf_path, strlen(gf->gf_path));
+ ptr += strlen(gf->gf_path);
+ *ptr = '/';
+ }
+
+ CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n",
+ tgt->ltd_exp->exp_obd->obd_name,
+ gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
+ gf->gf_linkno);
+
+ if (rc == 0)
+ goto out_fid2path;
+
+ /* sigh, we have to go to another MDT to continue building the path */
+ if (remote_gf == NULL) {
+ remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
+ OBD_ALLOC(remote_gf, remote_gf_size);
+ if (remote_gf == NULL) {
+ rc = -ENOMEM;
+ goto out_fid2path;
+ }
+ remote_gf->gf_pathlen = PATH_MAX;
+ }
+
+ if (!fid_is_sane(&gf->gf_fid)) {
+ CERROR("%s: invalid FID "DFID": rc = %d\n",
+ tgt->ltd_exp->exp_obd->obd_name,
+ PFID(&gf->gf_fid), -EINVAL);
+ rc = -EINVAL;
+ goto out_fid2path;
+ }
+
+ tgt = lmv_find_target(lmv, &gf->gf_fid);
+ if (IS_ERR(tgt)) {
+ rc = -EINVAL;
+ goto out_fid2path;
+ }
+
+ remote_gf->gf_fid = gf->gf_fid;
+ remote_gf->gf_recno = -1;
+ remote_gf->gf_linkno = -1;
+ memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
+ gf = remote_gf;
+ goto repeat_fid2path;
+
+out_fid2path:
+ if (remote_gf != NULL)
+ OBD_FREE(remote_gf, remote_gf_size);
+ return rc;
+}
+
+static int lmv_hsm_req_count(struct lmv_obd *lmv,
+ const struct hsm_user_request *hur,
+ const struct lmv_tgt_desc *tgt_mds)
+{
+ int i, nr = 0;
+ struct lmv_tgt_desc *curr_tgt;
+
+ /* count how many requests must be sent to the given target */
+ for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
+ curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid);
+ if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid))
+ nr++;
+ }
+ return nr;
+}
+
+static void lmv_hsm_req_build(struct lmv_obd *lmv,
+ struct hsm_user_request *hur_in,
+ const struct lmv_tgt_desc *tgt_mds,
+ struct hsm_user_request *hur_out)
+{
+ int i, nr_out;
+ struct lmv_tgt_desc *curr_tgt;
+
+ /* build the hsm_user_request for the given target */
+ hur_out->hur_request = hur_in->hur_request;
+ nr_out = 0;
+ for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) {
+ curr_tgt = lmv_find_target(lmv,
+ &hur_in->hur_user_item[i].hui_fid);
+ if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) {
+ hur_out->hur_user_item[nr_out] =
+ hur_in->hur_user_item[i];
+ nr_out++;
+ }
+ }
+ hur_out->hur_request.hr_itemcount = nr_out;
+ memcpy(hur_data(hur_out), hur_data(hur_in),
+ hur_in->hur_request.hr_data_len);
+}
+
+static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, unsigned int cmd, int len,
+ struct lustre_kernelcomm *lk, void *uarg)
+{
+ int i, rc = 0;
+
+ /* unregister request (call from llapi_hsm_copytool_fini) */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ /* best effort: try to clean as much as possible
+ * (continue on error) */
+ obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg);
+ }
+
+ /* Whatever the result, remove copytool from kuc groups.
+ * Unreachable coordinators will get EPIPE on their next request
+ * and will unregister automatically.
+ */
+ rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
+ return rc;
+}
+
+static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len,
+ struct lustre_kernelcomm *lk, void *uarg)
+{
+ struct file *filp;
+ int i, j, err;
+ int rc = 0;
+ bool any_set = false;
+
+ /* All or nothing: try to register with all MDSes.
+ * On failure, unregister from the previously registered MDSes,
+ * unless the failure is due to an inactive target. */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp,
+ len, lk, uarg);
+ if (err) {
+ if (lmv->tgts[i]->ltd_active) {
+ /* permanent error */
+ CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n",
+ lmv->tgts[i]->ltd_uuid.uuid,
+ i, cmd, err);
+ rc = err;
+ lk->lk_flags |= LK_FLG_STOP;
+ /* unregister from previous MDS */
+ for (j = 0; j < i; j++)
+ obd_iocontrol(cmd,
+ lmv->tgts[j]->ltd_exp,
+ len, lk, uarg);
+ return rc;
+ }
+ /* else: transient error;
+ * kuc will register with the missing MDT
+ * when it comes back */
+ } else {
+ any_set = true;
+ }
+ }
+
+ if (!any_set)
+ /* no registration done: return error */
+ return -ENOTCONN;
+
+ /* at least one registration done, with no failure */
+ filp = fget(lk->lk_wfd);
+ if (filp == NULL)
+ return -EBADF;
+
+ rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group, lk->lk_data);
+ if (rc != 0)
+ fput(filp);
+ return rc;
+}
+
+
+
+
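+/*
+ * ioctl dispatcher: commands that carry a FID or an MDT index are routed to
+ * the corresponding target; the default case is broadcast to all configured
+ * targets and fails with -EIO when none of them handled the command.
+ */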
+static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
+ int len, void *karg, void *uarg)
+{
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obddev->u.lmv;
+ int i = 0;
+ int rc = 0;
+ int set = 0;
+ int count = lmv->desc.ld_tgt_count;
+
+ if (count == 0)
+ return -ENOTTY;
+
+ switch (cmd) {
+ case IOC_OBD_STATFS: {
+ struct obd_ioctl_data *data = karg;
+ struct obd_device *mdc_obd;
+ struct obd_statfs stat_buf = {0};
+ __u32 index;
+
+ memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+ if (index >= count)
+ return -ENODEV;
+
+ if (lmv->tgts[index] == NULL ||
+ lmv->tgts[index]->ltd_active == 0)
+ return -ENODATA;
+
+ mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp);
+ if (!mdc_obd)
+ return -EINVAL;
+
+ /* copy UUID */
+ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
+ min((int) data->ioc_plen2,
+ (int) sizeof(struct obd_uuid))))
+ return -EFAULT;
+
+ rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ 0);
+ if (rc)
+ return rc;
+ if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+ min((int) data->ioc_plen1,
+ (int) sizeof(stat_buf))))
+ return -EFAULT;
+ break;
+ }
+ case OBD_IOC_QUOTACTL: {
+ struct if_quotactl *qctl = karg;
+ struct lmv_tgt_desc *tgt = NULL;
+ struct obd_quotactl *oqctl;
+
+ if (qctl->qc_valid == QC_MDTIDX) {
+ if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+ return -EINVAL;
+
+ tgt = lmv->tgts[qctl->qc_idx];
+ if (tgt == NULL || tgt->ltd_exp == NULL)
+ return -EINVAL;
+ } else if (qctl->qc_valid == QC_UUID) {
+ for (i = 0; i < count; i++) {
+ tgt = lmv->tgts[i];
+ if (tgt == NULL)
+ continue;
+ if (!obd_uuid_equals(&tgt->ltd_uuid,
+ &qctl->obd_uuid))
+ continue;
+
+ if (tgt->ltd_exp == NULL)
+ return -EINVAL;
+
+ break;
+ }
+ } else {
+ return -EINVAL;
+ }
+
+ if (i >= count)
+ return -EAGAIN;
+
+ LASSERT(tgt && tgt->ltd_exp);
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl)
+ return -ENOMEM;
+
+ QCTL_COPY(oqctl, qctl);
+ rc = obd_quotactl(tgt->ltd_exp, oqctl);
+ if (rc == 0) {
+ QCTL_COPY(qctl, oqctl);
+ qctl->qc_valid = QC_MDTIDX;
+ qctl->obd_uuid = tgt->ltd_uuid;
+ }
+ OBD_FREE_PTR(oqctl);
+ break;
+ }
+ case OBD_IOC_CHANGELOG_SEND:
+ case OBD_IOC_CHANGELOG_CLEAR: {
+ struct ioc_changelog *icc = karg;
+
+ if (icc->icc_mdtindex >= count)
+ return -ENODEV;
+
+ if (lmv->tgts[icc->icc_mdtindex] == NULL ||
+ lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL ||
+ lmv->tgts[icc->icc_mdtindex]->ltd_active == 0)
+ return -ENODEV;
+ rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp,
+ sizeof(*icc), icc, NULL);
+ break;
+ }
+ case LL_IOC_GET_CONNECT_FLAGS: {
+ if (lmv->tgts[0] == NULL)
+ return -ENODATA;
+ rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
+ break;
+ }
+ case OBD_IOC_FID2PATH: {
+ rc = lmv_fid2path(exp, len, karg, uarg);
+ break;
+ }
+ case LL_IOC_HSM_STATE_GET:
+ case LL_IOC_HSM_STATE_SET:
+ case LL_IOC_HSM_ACTION: {
+ struct md_op_data *op_data = karg;
+ struct lmv_tgt_desc *tgt;
+
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ if (tgt->ltd_exp == NULL)
+ return -EINVAL;
+
+ rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
+ break;
+ }
+ case LL_IOC_HSM_PROGRESS: {
+ const struct hsm_progress_kernel *hpk = karg;
+ struct lmv_tgt_desc *tgt;
+
+ tgt = lmv_find_target(lmv, &hpk->hpk_fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+ rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
+ break;
+ }
+ case LL_IOC_HSM_REQUEST: {
+ struct hsm_user_request *hur = karg;
+ struct lmv_tgt_desc *tgt;
+ unsigned int reqcount = hur->hur_request.hr_itemcount;
+
+ if (reqcount == 0)
+ return 0;
+
+ /* if the request is about a single fid
+ * or if there is a single MDS, no need to split
+ * the request. */
+ if (reqcount == 1 || count == 1) {
+ tgt = lmv_find_target(lmv,
+ &hur->hur_user_item[0].hui_fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+ rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
+ } else {
+ /* split fid list to their respective MDS */
+ for (i = 0; i < count; i++) {
+ unsigned int nr, reqlen;
+ int rc1;
+ struct hsm_user_request *req;
+
+ nr = lmv_hsm_req_count(lmv, hur, lmv->tgts[i]);
+ if (nr == 0) /* nothing for this MDS */
+ continue;
+
+ /* build a request with fids for this MDS */
+ reqlen = offsetof(typeof(*hur),
+ hur_user_item[nr])
+ + hur->hur_request.hr_data_len;
+ OBD_ALLOC_LARGE(req, reqlen);
+ if (req == NULL)
+ return -ENOMEM;
+
+ lmv_hsm_req_build(lmv, hur, lmv->tgts[i], req);
+
+ rc1 = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp,
+ reqlen, req, uarg);
+ if (rc1 != 0 && rc == 0)
+ rc = rc1;
+ OBD_FREE_LARGE(req, reqlen);
+ }
+ }
+ break;
+ }
+ case LL_IOC_LOV_SWAP_LAYOUTS: {
+ struct md_op_data *op_data = karg;
+ struct lmv_tgt_desc *tgt1, *tgt2;
+
+ tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt1))
+ return PTR_ERR(tgt1);
+
+ tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
+ if (IS_ERR(tgt2))
+ return PTR_ERR(tgt2);
+
+ if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
+ return -EINVAL;
+
+ /* only files on same MDT can have their layouts swapped */
+ if (tgt1->ltd_idx != tgt2->ltd_idx)
+ return -EPERM;
+
+ rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
+ break;
+ }
+ case LL_IOC_HSM_CT_START: {
+ struct lustre_kernelcomm *lk = karg;
+ if (lk->lk_flags & LK_FLG_STOP)
+ rc = lmv_hsm_ct_unregister(lmv, cmd, len, lk, uarg);
+ else
+ rc = lmv_hsm_ct_register(lmv, cmd, len, lk, uarg);
+ break;
+ }
+ default:
+ for (i = 0; i < count; i++) {
+ struct obd_device *mdc_obd;
+ int err;
+
+ if (lmv->tgts[i] == NULL ||
+ lmv->tgts[i]->ltd_exp == NULL)
+ continue;
+ /* ll_umount_begin() sets the force flag on the LMV, not the
+ * MDCs. Pass it through to each MDC. */
+ mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp);
+ mdc_obd->obd_force = obddev->obd_force;
+ err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len,
+ karg, uarg);
+ if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+ return err;
+ } else if (err) {
+ if (lmv->tgts[i]->ltd_active) {
+ CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n",
+ lmv->tgts[i]->ltd_uuid.uuid,
+ i, cmd, err);
+ if (!rc)
+ rc = err;
+ }
+ } else
+ set = 1;
+ }
+ if (!set && !rc)
+ rc = -EIO;
+ }
+ return rc;
+}
+
+#if 0
+static int lmv_all_chars_policy(int count, const char *name,
+ int len)
+{
+ unsigned int c = 0;
+
+ while (len > 0)
+ c += name[--len];
+ c = c % count;
+ return c;
+}
+
+static int lmv_nid_policy(struct lmv_obd *lmv)
+{
+ struct obd_import *imp;
+ __u32 id;
+
+ /*
+ * XXX: To get nid we assume that underlying obd device is mdc.
+ */
+ imp = class_exp2cliimp(lmv->tgts[0].ltd_exp);
+ id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32);
+ return id % lmv->desc.ld_tgt_count;
+}
+
+static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+ enum placement_policy placement)
+{
+ switch (placement) {
+ case PLACEMENT_CHAR_POLICY:
+ return lmv_all_chars_policy(lmv->desc.ld_tgt_count,
+ op_data->op_name,
+ op_data->op_namelen);
+ case PLACEMENT_NID_POLICY:
+ return lmv_nid_policy(lmv);
+
+ default:
+ break;
+ }
+
+ CERROR("Unsupported placement policy %x\n", placement);
+ return -EINVAL;
+}
+#endif
+
+/**
+ * This is the _inode_ placement policy function (not the name-based one).
+ */
+static int lmv_placement_policy(struct obd_device *obd,
+ struct md_op_data *op_data, u32 *mds)
+{
+ struct lmv_obd *lmv = &obd->u.lmv;
+
+ LASSERT(mds != NULL);
+
+ if (lmv->desc.ld_tgt_count == 1) {
+ *mds = 0;
+ return 0;
+ }
+
+ /**
+ * If a stripe offset is provided during setdirstripe
+ * (setdirstripe -i xx), MDT xx will be chosen.
+ */
+ if (op_data->op_cli_flags & CLI_SET_MEA) {
+ struct lmv_user_md *lum;
+
+ lum = (struct lmv_user_md *)op_data->op_data;
+ if (lum->lum_type == LMV_STRIPE_TYPE &&
+ lum->lum_stripe_offset != -1) {
+ if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) {
+ CERROR("%s: Stripe_offset %d > MDT count %d: rc = %d\n",
+ obd->obd_name,
+ lum->lum_stripe_offset,
+ lmv->desc.ld_tgt_count, -ERANGE);
+ return -ERANGE;
+ }
+ *mds = lum->lum_stripe_offset;
+ return 0;
+ }
+ }
+
+ /* Allocate the new FID on a target chosen according to the operation type
+ * and the parent's home MDS. */
+ *mds = op_data->op_mds;
+ return 0;
+}
+
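+/*
+ * Allocate a new FID from the target with index @mds, serialized by the
+ * per-target ltd_fid_mutex.
+ */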
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds)
+{
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ tgt = lmv_get_target(lmv, mds);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ /*
+ * New sequence allocation and FLD setup should be atomic; otherwise the
+ * server may find that the sequence in the newly allocated FID is not yet
+ * known.
+ */
+ mutex_lock(&tgt->ltd_fid_mutex);
+
+ if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ /*
+ * Ask the underlying target layer to allocate a new FID.
+ */
+ rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL);
+ if (rc > 0) {
+ LASSERT(fid_is_sane(fid));
+ rc = 0;
+ }
+
+out:
+ mutex_unlock(&tgt->ltd_fid_mutex);
+ return rc;
+}
+
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+ struct md_op_data *op_data)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obd->u.lmv;
+ u32 mds = 0;
+ int rc;
+
+ LASSERT(op_data != NULL);
+ LASSERT(fid != NULL);
+
+ rc = lmv_placement_policy(obd, op_data, &mds);
+ if (rc) {
+ CERROR("Can't get target for allocating fid, rc %d\n",
+ rc);
+ return rc;
+ }
+
+ rc = __lmv_fid_alloc(lmv, fid, mds);
+ if (rc) {
+ CERROR("Can't alloc new fid, rc %d\n", rc);
+ return rc;
+ }
+
+ return rc;
+}
+
+static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lprocfs_static_vars lvars;
+ struct lmv_desc *desc;
+ int rc;
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+ CERROR("LMV setup requires a descriptor\n");
+ return -EINVAL;
+ }
+
+ desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
+ if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+ CERROR("Lmv descriptor size wrong: %d > %d\n",
+ (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+ return -EINVAL;
+ }
+
+ OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32);
+ if (lmv->tgts == NULL)
+ return -ENOMEM;
+ lmv->tgts_size = 32;
+
+ obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
+ lmv->desc.ld_tgt_count = 0;
+ lmv->desc.ld_active_tgt_count = 0;
+ lmv->max_cookiesize = 0;
+ lmv->max_def_easize = 0;
+ lmv->max_easize = 0;
+ lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
+
+ spin_lock_init(&lmv->lmv_lock);
+ mutex_init(&lmv->init_mutex);
+
+ lprocfs_lmv_init_vars(&lvars);
+
+ lprocfs_obd_setup(obd, lvars.obd_vars);
+#if defined (CONFIG_PROC_FS)
+ {
+ rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+ 0444, &lmv_proc_target_fops, obd);
+ if (rc)
+ CWARN("%s: error adding LMV target_obd file: rc = %d\n",
+ obd->obd_name, rc);
+ }
+#endif
+ rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
+ LUSTRE_CLI_FLD_HASH_DHT);
+ if (rc) {
+ CERROR("Can't init FLD, err %d\n", rc);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ return rc;
+}
+
+static int lmv_cleanup(struct obd_device *obd)
+{
+ struct lmv_obd *lmv = &obd->u.lmv;
+
+ fld_client_fini(&lmv->lmv_fld);
+ if (lmv->tgts != NULL) {
+ int i;
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL)
+ continue;
+ lmv_del_target(lmv, i);
+ }
+ OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
+ lmv->tgts_size = 0;
+ }
+ return 0;
+}
+
+static int lmv_process_config(struct obd_device *obd, u32 len, void *buf)
+{
+ struct lustre_cfg *lcfg = buf;
+ struct obd_uuid obd_uuid;
+ int gen;
+ __u32 index;
+ int rc;
+
+ switch (lcfg->lcfg_command) {
+ case LCFG_ADD_MDC:
+ /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID
+ * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1));
+
+ if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ rc = lmv_add_target(obd, &obd_uuid, index, gen);
+ goto out;
+ default:
+ CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+ rc = -EINVAL;
+ goto out;
+ }
+out:
+ return rc;
+}
+
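+/*
+ * statfs over all targets: block and file counters are accumulated into
+ * @osfs, except for OBD_STATFS_FOR_MDT0 requests, which return the MDT0
+ * numbers only.
+ */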
+static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct obd_statfs *temp;
+ int rc = 0;
+ int i;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ OBD_ALLOC(temp, sizeof(*temp));
+ if (temp == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+ continue;
+
+ rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp,
+ max_age, flags);
+ if (rc) {
+ CERROR("can't stat MDS #%d (%s), error %d\n", i,
+ lmv->tgts[i]->ltd_exp->exp_obd->obd_name,
+ rc);
+ goto out_free_temp;
+ }
+
+ if (i == 0) {
+ *osfs = *temp;
+ /* If the statfs comes from mount, it only needs to
+ * retrieve the necessary information from MDT0,
+ * i.e. mount does not need the osfs merged
+ * from all of the MDTs.
+ * This also means clients can be mounted as long as
+ * MDT0 is in service. */
+ if (flags & OBD_STATFS_FOR_MDT0)
+ goto out_free_temp;
+ } else {
+ osfs->os_bavail += temp->os_bavail;
+ osfs->os_blocks += temp->os_blocks;
+ osfs->os_ffree += temp->os_ffree;
+ osfs->os_files += temp->os_files;
+ }
+ }
+
+out_free_temp:
+ OBD_FREE(temp, sizeof(*temp));
+ return rc;
+}
+
+static int lmv_getstatus(struct obd_export *exp,
+ struct lu_fid *fid,
+ struct obd_capa **pc)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc);
+ return rc;
+}
+
+static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, u64 valid, const char *name,
+ const char *input, int input_size, int output_size,
+ int flags, struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input,
+ input_size, output_size, flags, request);
+
+ return rc;
+}
+
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, u64 valid, const char *name,
+ const char *input, int input_size, int output_size,
+ int flags, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input,
+ input_size, output_size, flags, suppgid,
+ request);
+
+ return rc;
+}
+
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ if (op_data->op_flags & MF_GET_MDT_IDX) {
+ op_data->op_mds = tgt->ltd_idx;
+ return 0;
+ }
+
+ rc = md_getattr(tgt->ltd_exp, op_data, request);
+
+ return rc;
+}
+
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int i;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+ /*
+ * With DNE every object can have two locks in different namespaces: a
+ * LOOKUP lock in the namespace of the MDT storing the dentry, and an
+ * UPDATE/OPEN lock in the namespace of the MDT storing the inode.
+ */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+ continue;
+ md_null_inode(lmv->tgts[i]->ltd_exp, fid);
+ }
+
+ return 0;
+}
+
+static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+ ldlm_iterator_t it, void *data)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int i;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+ /*
+ * With DNE every object can have two locks in different namespaces: a
+ * LOOKUP lock in the namespace of the MDT storing the dentry, and an
+ * UPDATE/OPEN lock in the namespace of the MDT storing the inode.
+ */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+ continue;
+ rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data);
+ if (rc)
+ return rc;
+ }
+
+ return rc;
+}
+
+
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+ struct md_open_data *mod, struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+ rc = md_close(tgt->ltd_exp, op_data, mod, request);
+ return rc;
+}
+
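+/*
+ * Locate the MDS serving @fid and record its index in op_data->op_mds for
+ * the subsequent metadata request.
+ */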
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+ struct lu_fid *fid)
+{
+ struct lmv_tgt_desc *tgt;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return tgt;
+
+ op_data->op_mds = tgt->ltd_idx;
+
+ return tgt;
+}
+
+static int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
+ const void *data, int datalen, int mode, __u32 uid,
+ __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ if (!lmv->desc.ld_active_tgt_count)
+ return -EIO;
+
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
+ op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+ op_data->op_mds);
+
+ op_data->op_flags |= MF_MDC_CANCEL_FID1;
+ rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
+ cap_effective, rdev, request);
+
+ if (rc == 0) {
+ if (*request == NULL)
+ return rc;
+ CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
+ }
+ return rc;
+}
+
+static int lmv_done_writing(struct obd_export *exp,
+ struct md_op_data *op_data,
+ struct md_open_data *mod)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_done_writing(tgt->ltd_exp, op_data, mod);
+ return rc;
+}
+
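+/*
+ * Finish an intent enqueue that hit a cross-referenced object (OBD_MD_MDS
+ * set in the reply body): release the LOOKUP lock taken on the first MDT and
+ * re-enqueue on the MDT that actually holds the object, using MDS_CROSS_REF.
+ */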
+static int
+lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it, struct md_op_data *op_data,
+ struct lustre_handle *lockh, void *lmm, int lmmsize,
+ __u64 extra_lock_flags)
+{
+ struct ptlrpc_request *req = it->d.lustre.it_data;
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lustre_handle plock;
+ struct lmv_tgt_desc *tgt;
+ struct md_op_data *rdata;
+ struct lu_fid fid1;
+ struct mdt_body *body;
+ int rc = 0;
+ int pmode;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+
+ if (!(body->valid & OBD_MD_MDS))
+ return 0;
+
+ CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
+ LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
+
+ /*
+ * We got the LOOKUP lock, but we really need the attributes.
+ */
+ pmode = it->d.lustre.it_lock_mode;
+ LASSERT(pmode != 0);
+ memcpy(&plock, lockh, sizeof(plock));
+ it->d.lustre.it_lock_mode = 0;
+ it->d.lustre.it_data = NULL;
+ fid1 = body->fid1;
+
+ ptlrpc_req_finished(req);
+
+ tgt = lmv_find_target(lmv, &fid1);
+ if (IS_ERR(tgt)) {
+ rc = PTR_ERR(tgt);
+ goto out;
+ }
+
+ OBD_ALLOC_PTR(rdata);
+ if (rdata == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rdata->op_fid1 = fid1;
+ rdata->op_bias = MDS_CROSS_REF;
+
+ rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
+ lmm, lmmsize, NULL, extra_lock_flags);
+ OBD_FREE_PTR(rdata);
+out:
+ ldlm_lock_decref(&plock, pmode);
+ return rc;
+}
+
+static int
+lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it, struct md_op_data *op_data,
+ struct lustre_handle *lockh, void *lmm, int lmmsize,
+ struct ptlrpc_request **req, __u64 extra_lock_flags)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
+ LL_IT2STR(it), PFID(&op_data->op_fid1));
+
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
+ LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx);
+
+ rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh,
+ lmm, lmmsize, req, extra_lock_flags);
+
+ if (rc == 0 && it && it->it_op == IT_OPEN) {
+ rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
+ lmm, lmmsize, extra_lock_flags);
+ }
+ return rc;
+}
+
+static int
+lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req = NULL;
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ struct mdt_body *body;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
+ op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+ tgt->ltd_idx);
+
+ rc = md_getattr_name(tgt->ltd_exp, op_data, request);
+ if (rc != 0)
+ return rc;
+
+ body = req_capsule_server_get(&(*request)->rq_pill,
+ &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+
+ if (body->valid & OBD_MD_MDS) {
+ struct lu_fid rid = body->fid1;
+ CDEBUG(D_INODE, "Request attrs for "DFID"\n",
+ PFID(&rid));
+
+ tgt = lmv_find_target(lmv, &rid);
+ if (IS_ERR(tgt)) {
+ ptlrpc_req_finished(*request);
+ return PTR_ERR(tgt);
+ }
+
+ op_data->op_fid1 = rid;
+ op_data->op_valid |= OBD_MD_FLCROSSREF;
+ op_data->op_namelen = 0;
+ op_data->op_name = NULL;
+ rc = md_getattr_name(tgt->ltd_exp, op_data, &req);
+ ptlrpc_req_finished(*request);
+ *request = req;
+ }
+
+ return rc;
+}
+
+#define md_op_data_fid(op_data, fl) \
+ (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \
+ fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \
+ fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \
+ fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \
+ NULL)
+
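+/*
+ * Cancel unused locks of the given @mode and inodebits on the FID selected
+ * by @flag. If that FID lives on the same target as the operation (@op_tgt),
+ * no cancel RPC is sent; the flag is only recorded in op_data, leaving the
+ * cancellation to the MDT executing the operation.
+ */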
+static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
+ int op_tgt, ldlm_mode_t mode, int bits, int flag)
+{
+ struct lu_fid *fid = md_op_data_fid(op_data, flag);
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ ldlm_policy_data_t policy = {{0}};
+ int rc = 0;
+
+ if (!fid_is_sane(fid))
+ return 0;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ if (tgt->ltd_idx != op_tgt) {
+ CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
+ policy.l_inodebits.bits = bits;
+ rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
+ mode, LCF_ASYNC, NULL);
+ } else {
+ CDEBUG(D_INODE,
+ "EARLY_CANCEL skip operation target %d on "DFID"\n",
+ op_tgt, PFID(fid));
+ op_data->op_flags |= flag;
+ rc = 0;
+ }
+
+ return rc;
+}
+
+/*
+ * llite passes the FID of the target inode in op_data->op_fid1 and the FID
+ * of the directory in op_data->op_fid2.
+ */
+static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ LASSERT(op_data->op_namelen != 0);
+
+ CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
+ PFID(&op_data->op_fid2), op_data->op_namelen,
+ op_data->op_name, PFID(&op_data->op_fid1));
+
+ op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ op_data->op_cap = cfs_curproc_cap_pack();
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ /*
+ * Cancel UPDATE lock on child (fid1).
+ */
+ op_data->op_flags |= MF_MDC_CANCEL_FID2;
+ rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+ MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
+ if (rc != 0)
+ return rc;
+
+ rc = md_link(tgt->ltd_exp, op_data, request);
+
+ return rc;
+}
+
+static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new, int newlen,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *src_tgt;
+ struct lmv_tgt_desc *tgt_tgt;
+ int rc;
+
+ LASSERT(oldlen != 0);
+
+ CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
+ oldlen, old, PFID(&op_data->op_fid1),
+ newlen, new, PFID(&op_data->op_fid2));
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ op_data->op_cap = cfs_curproc_cap_pack();
+ src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(src_tgt))
+ return PTR_ERR(src_tgt);
+
+ tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+ if (IS_ERR(tgt_tgt))
+ return PTR_ERR(tgt_tgt);
+ /*
+ * LOOKUP lock on src child (fid3) should also be cancelled for
+ * src_tgt in mdc_rename.
+ */
+ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+ /*
+ * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
+ * own target.
+ */
+ rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+ LCK_EX, MDS_INODELOCK_UPDATE,
+ MF_MDC_CANCEL_FID2);
+
+ /*
+ * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
+ */
+ if (rc == 0) {
+ rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+ LCK_EX, MDS_INODELOCK_LOOKUP,
+ MF_MDC_CANCEL_FID4);
+ }
+
+ /*
+ * Cancel all the locks on tgt child (fid4).
+ */
+ if (rc == 0)
+ rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+ LCK_EX, MDS_INODELOCK_FULL,
+ MF_MDC_CANCEL_FID4);
+
+ if (rc == 0)
+ rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen,
+ new, newlen, request);
+ return rc;
+}
+
+static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len,
+ struct ptlrpc_request **request,
+ struct md_open_data **mod)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc = 0;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
+ PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
+
+ op_data->op_flags |= MF_MDC_CANCEL_FID1;
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2,
+ ea2len, request, mod);
+
+ return rc;
+}
+
+static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_sync(tgt->ltd_exp, fid, oc, request);
+ return rc;
+}
+
+/*
+ * Adjust a set of pages, each page containing an array of lu_dirpages,
+ * so that each page can be used as a single logical lu_dirpage.
+ *
+ * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
+ * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
+ * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end
+ * value is used as a cookie to request the next lu_dirpage in a
+ * directory listing that spans multiple pages (two in this example):
+ * ________
+ * | |
+ * .|--------v------- -----.
+ * |s|e|f|p|ent|ent| ... |ent|
+ * '--|-------------- -----' Each CFS_PAGE contains a single
+ * '------. lu_dirpage.
+ * .---------v------- -----.
+ * |s|e|f|p|ent| 0 | ... | 0 |
+ * '----------------- -----'
+ *
+ * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is
+ * larger than LU_PAGE_SIZE, a single host page may contain multiple
+ * lu_dirpages. After reading the lu_dirpages from the MDS, the
+ * ldp_hash_end of the first lu_dirpage refers to the one immediately
+ * after it in the same CFS_PAGE (arrows simplified for brevity, but
+ * in general e0==s1, e1==s2, etc.):
+ *
+ * .-------------------- -----.
+ * |s0|e0|f0|p|ent|ent| ... |ent|
+ * |---v---------------- -----|
+ * |s1|e1|f1|p|ent|ent| ... |ent|
+ * |---v---------------- -----| Here, each CFS_PAGE contains
+ * ... multiple lu_dirpages.
+ * |---v---------------- -----|
+ * |s'|e'|f'|p|ent|ent| ... |ent|
+ * '---|---------------- -----'
+ * v
+ * .----------------------------.
+ * | next CFS_PAGE |
+ *
+ * This structure is transformed into a single logical lu_dirpage as follows:
+ *
+ * - Replace e0 with e' so the request for the next lu_dirpage gets the page
+ * labeled 'next CFS_PAGE'.
+ *
+ * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
+ * a hash collision with the next page exists.
+ *
+ * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
+ * to the first entry of the next lu_dirpage.
+ */
+#if PAGE_CACHE_SIZE > LU_PAGE_SIZE
+static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
+{
+ int i;
+
+ for (i = 0; i < ncfspgs; i++) {
+ struct lu_dirpage *dp = kmap(pages[i]);
+ struct lu_dirpage *first = dp;
+ struct lu_dirent *end_dirent = NULL;
+ struct lu_dirent *ent;
+ __u64 hash_end = dp->ldp_hash_end;
+ __u32 flags = dp->ldp_flags;
+
+ while (--nlupgs > 0) {
+ ent = lu_dirent_start(dp);
+ for (end_dirent = ent; ent != NULL;
+ end_dirent = ent, ent = lu_dirent_next(ent));
+
+ /* Advance dp to next lu_dirpage. */
+ dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE);
+
+ /* Check if we've reached the end of the CFS_PAGE. */
+ if (!((unsigned long)dp & ~CFS_PAGE_MASK))
+ break;
+
+ /* Save the hash and flags of this lu_dirpage. */
+ hash_end = dp->ldp_hash_end;
+ flags = dp->ldp_flags;
+
+ /* Check if lu_dirpage contains no entries. */
+ if (!end_dirent)
+ break;
+
+ /* Enlarge the end entry lde_reclen from 0 to
+ * first entry of next lu_dirpage. */
+ LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0);
+ end_dirent->lde_reclen =
+ cpu_to_le16((char *)(dp->ldp_entries) -
+ (char *)end_dirent);
+ }
+
+ first->ldp_hash_end = hash_end;
+ first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
+ first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);
+
+ kunmap(pages[i]);
+ }
+ LASSERTF(nlupgs == 0, "left = %d", nlupgs);
+}
+#else
+#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
+#endif /* PAGE_CACHE_SIZE > LU_PAGE_SIZE */
+
+static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
+ struct page **pages, struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ __u64 offset = op_data->op_offset;
+ int rc;
+ int ncfspgs; /* pages read in PAGE_CACHE_SIZE units */
+ int nlupgs; /* pages read in LU_PAGE_SIZE units */
+ struct lmv_tgt_desc *tgt;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "READPAGE at %#llx from "DFID"\n",
+ offset, PFID(&op_data->op_fid1));
+
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_readpage(tgt->ltd_exp, op_data, pages, request);
+ if (rc != 0)
+ return rc;
+
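+ /* Work out how many native (CFS) pages and LU_PAGE-sized dirpages
+ * the bulk transfer actually filled. */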
+ ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1)
+ >> PAGE_CACHE_SHIFT;
+ nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
+ LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
+ LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
+
+ CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
+ op_data->op_npages);
+
+ lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
+
+ return rc;
+}
+
+static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt = NULL;
+ struct mdt_body *body;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+retry:
+ /* Send unlink requests to the MDT where the child is located */
+ if (likely(!fid_is_zero(&op_data->op_fid2)))
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+ else
+ tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ op_data->op_cap = cfs_curproc_cap_pack();
+
+ /*
+ * If child's fid is given, cancel unused locks for it if it is from
+ * another export than parent.
+ *
+ * The LOOKUP lock for the child (fid3) should also be cancelled on
+ * the parent target in mdc_unlink().
+ */
+ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+ /*
+ * Cancel FULL locks on child (fid3).
+ */
+ rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+ MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+
+ if (rc != 0)
+ return rc;
+
+ CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
+ PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
+
+ rc = md_unlink(tgt->ltd_exp, op_data, request);
+ if (rc != 0 && rc != -EREMOTE)
+ return rc;
+
+ body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ return -EPROTO;
+
+ /* Not cross-ref case, just get out of here. */
+ if (likely(!(body->valid & OBD_MD_MDS)))
+ return 0;
+
+ CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
+ exp->exp_obd->obd_name, PFID(&body->fid1));
+
+ /* This is a remote object; try the remote MDT. Note: it may
+ * retry more than once here. Consider the following case:
+ * /mnt/lustre is the root on MDT0, remote1 is on MDT1.
+ * 1. Initially client A does not know where remote1 is; it sends
+ * the unlink RPC to MDT0, MDT0 returns -EREMOTE, and A
+ * resends the unlink RPC to MDT1 (1st retry).
+ *
+ * 2. While that unlink RPC is in flight, client B does
+ * mv /mnt/lustre/remote1 /mnt/lustre/remote2
+ * and creates a new remote1, but on MDT0.
+ *
+ * 3. MDT1 gets the unlink RPC (from A), takes the remote lock on
+ * /mnt/lustre, looks up the fid of remote1, finds that it is
+ * a remote dir again, and replies -EREMOTE again.
+ *
+ * 4. A then resends the unlink RPC to MDT0 (2nd retry).
+ *
+ * In theory this could retry indefinitely, but that should be
+ * a very rare case. */
+ op_data->op_fid2 = body->fid1;
+ ptlrpc_req_finished(*request);
+ *request = NULL;
+
+ goto retry;
+}
+
+static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+ struct lmv_obd *lmv = &obd->u.lmv;
+
+ switch (stage) {
+ case OBD_CLEANUP_EARLY:
+ /* XXX: we should be calling obd_precleanup() down the
+ * stack here. */
+ break;
+ case OBD_CLEANUP_EXPORTS:
+ fld_client_proc_fini(&lmv->lmv_fld);
+ lprocfs_obd_cleanup(obd);
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
+ __u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm)
+{
+ struct obd_device *obd;
+ struct lmv_obd *lmv;
+ int rc = 0;
+
+ obd = class_exp2obd(exp);
+ if (obd == NULL) {
+ CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n",
+ exp->exp_handle.h_cookie);
+ return -EINVAL;
+ }
+
+ lmv = &obd->u.lmv;
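+ /* "remote_flag" is answered by querying each connected target in
+ * turn until one replies successfully. */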
+ if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
+ struct lmv_tgt_desc *tgt;
+ int i;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ LASSERT(*vallen == sizeof(__u32));
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ /*
+ * All tgts should be connected when this gets called.
+ */
+ if (tgt == NULL || tgt->ltd_exp == NULL)
+ continue;
+
+ if (!obd_get_info(env, tgt->ltd_exp, keylen, key,
+ vallen, val, NULL))
+ return 0;
+ }
+ return -EINVAL;
+ } else if (KEY_IS(KEY_MAX_EASIZE) ||
+ KEY_IS(KEY_DEFAULT_EASIZE) ||
+ KEY_IS(KEY_MAX_COOKIESIZE) ||
+ KEY_IS(KEY_DEFAULT_COOKIESIZE) ||
+ KEY_IS(KEY_CONN_DATA)) {
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ /*
+ * Forward this request to the first MDS; it should know the
+ * LOV desc.
+ */
+ rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
+ vallen, val, NULL);
+ if (!rc && KEY_IS(KEY_CONN_DATA))
+ exp->exp_connect_data = *(struct obd_connect_data *)val;
+ return rc;
+ } else if (KEY_IS(KEY_TGT_COUNT)) {
+ *((int *)val) = lmv->desc.ld_tgt_count;
+ return 0;
+ }
+
+ CDEBUG(D_IOCTL, "Invalid key\n");
+ return -EINVAL;
+}
+
+static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen,
+ void *val, struct ptlrpc_request_set *set)
+{
+ struct lmv_tgt_desc *tgt;
+ struct obd_device *obd;
+ struct lmv_obd *lmv;
+ int rc = 0;
+
+ obd = class_exp2obd(exp);
+ if (obd == NULL) {
+ CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n",
+ exp->exp_handle.h_cookie);
+ return -EINVAL;
+ }
+ lmv = &obd->u.lmv;
+
+ if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) {
+ int i, err = 0;
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+
+ if (tgt == NULL || tgt->ltd_exp == NULL)
+ continue;
+
+ err = obd_set_info_async(env, tgt->ltd_exp,
+ keylen, key, vallen, val, set);
+ if (err && rc == 0)
+ rc = err;
+ }
+
+ return rc;
+ }
+
+ return -EINVAL;
+}
+
+static int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+ struct lov_stripe_md *lsm)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_stripe_md *meap;
+ struct lmv_stripe_md *lsmp;
+ int mea_size;
+ int i;
+
+ mea_size = lmv_get_easize(lmv);
+ if (!lmmp)
+ return mea_size;
+
+ if (*lmmp && !lsm) {
+ OBD_FREE_LARGE(*lmmp, mea_size);
+ *lmmp = NULL;
+ return 0;
+ }
+
+ if (*lmmp == NULL) {
+ OBD_ALLOC_LARGE(*lmmp, mea_size);
+ if (*lmmp == NULL)
+ return -ENOMEM;
+ }
+
+ if (!lsm)
+ return mea_size;
+
+ lsmp = (struct lmv_stripe_md *)lsm;
+ meap = (struct lmv_stripe_md *)*lmmp;
+
+ if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR &&
+ lsmp->mea_magic != MEA_MAGIC_ALL_CHARS)
+ return -EINVAL;
+
+ meap->mea_magic = cpu_to_le32(lsmp->mea_magic);
+ meap->mea_count = cpu_to_le32(lsmp->mea_count);
+ meap->mea_master = cpu_to_le32(lsmp->mea_master);
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ meap->mea_ids[i] = lsmp->mea_ids[i];
+ fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]);
+ }
+
+ return mea_size;
+}
+
+static int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+ struct lov_mds_md *lmm, int lmm_size)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_stripe_md **tmea = (struct lmv_stripe_md **)lsmp;
+ struct lmv_stripe_md *mea = (struct lmv_stripe_md *)lmm;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int mea_size;
+ int i;
+ __u32 magic;
+
+ mea_size = lmv_get_easize(lmv);
+ if (lsmp == NULL)
+ return mea_size;
+
+ if (*lsmp != NULL && lmm == NULL) {
+ OBD_FREE_LARGE(*tmea, mea_size);
+ *lsmp = NULL;
+ return 0;
+ }
+
+ LASSERT(mea_size == lmm_size);
+
+ OBD_ALLOC_LARGE(*tmea, mea_size);
+ if (*tmea == NULL)
+ return -ENOMEM;
+
+ if (!lmm)
+ return mea_size;
+
+ if (mea->mea_magic == MEA_MAGIC_LAST_CHAR ||
+ mea->mea_magic == MEA_MAGIC_ALL_CHARS ||
+ mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) {
+ magic = le32_to_cpu(mea->mea_magic);
+ } else {
+ /*
+ * Old mea is not handled here.
+ */
+ CERROR("Old not supportable EA is found\n");
+ LBUG();
+ }
+
+ (*tmea)->mea_magic = magic;
+ (*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+ (*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+
+ for (i = 0; i < (*tmea)->mea_count; i++) {
+ (*tmea)->mea_ids[i] = mea->mea_ids[i];
+ fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
+ }
+ return mea_size;
+}
+
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags, void *opaque)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ int rc = 0;
+ int err;
+ int i;
+
+ LASSERT(fid != NULL);
+
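+ /* Ask every active target to cancel its unused locks on this fid;
+ * the first error encountered is returned. */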
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
+ lmv->tgts[i]->ltd_active == 0)
+ continue;
+
+ err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
+ policy, mode, flags, opaque);
+ if (!rc)
+ rc = err;
+ }
+ return rc;
+}
+
+static int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+ __u64 *bits)
+{
+ struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+ int rc;
+
+ rc = md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
+ return rc;
+}
+
+static ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ struct lustre_handle *lockh)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ ldlm_mode_t rc;
+ int i;
+
+ CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+ /*
+ * With CMD every object can have two locks in different namespaces:
+ * lookup lock in space of mds storing direntry and update/open lock in
+ * space of mds storing inode. Thus we check all targets, not only the
+ * one the fid was created on.
+ */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ if (lmv->tgts[i] == NULL ||
+ lmv->tgts[i]->ltd_exp == NULL ||
+ lmv->tgts[i]->ltd_active == 0)
+ continue;
+
+ rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid,
+ type, policy, mode, lockh);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int lmv_get_lustre_md(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ struct obd_export *dt_exp,
+ struct obd_export *md_exp,
+ struct lustre_md *md)
+{
+ struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+ return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md);
+}
+
+static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+
+ if (md->mea)
+ obd_free_memmd(exp, (void *)&md->mea);
+ return md_free_lustre_md(lmv->tgts[0]->ltd_exp, md);
+}
+
+static int lmv_set_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och,
+ struct lookup_intent *it)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+
+ tgt = lmv_find_target(lmv, &och->och_fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ return md_set_open_replay_data(tgt->ltd_exp, och, it);
+}
+
+static int lmv_clear_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+
+ tgt = lmv_find_target(lmv, &och->och_fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ return md_clear_open_replay_data(tgt->ltd_exp, och);
+}
+
+static int lmv_get_remote_perm(struct obd_export *exp,
+ const struct lu_fid *fid,
+ struct obd_capa *oc, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request);
+ return rc;
+}
+
+static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+ renew_capa_cb_t cb)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_renew_capa(tgt->ltd_exp, oc, cb);
+ return rc;
+}
+
+static int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+ const struct req_msg_field *field,
+ struct obd_capa **oc)
+{
+ struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+ return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc);
+}
+
+static int lmv_intent_getattr_async(struct obd_export *exp,
+ struct md_enqueue_info *minfo,
+ struct ldlm_enqueue_info *einfo)
+{
+ struct md_op_data *op_data = &minfo->mi_data;
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt = NULL;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, &op_data->op_fid1);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo);
+ return rc;
+}
+
+static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+ struct lu_fid *fid, __u64 *bits)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int rc;
+
+ rc = lmv_check_connect(obd);
+ if (rc)
+ return rc;
+
+ tgt = lmv_find_target(lmv, fid);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits);
+ return rc;
+}
+
+/**
+ * For lmv we only need to send the request to the master MDT, which then
+ * handles it together with the slave MDTs. The only exception is Q_GETOQUOTA,
+ * for which we fetch the data directly from the slave MDTs.
+ */
+static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt = lmv->tgts[0];
+ int rc = 0, i;
+ __u64 curspace, curinodes;
+
+ if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) {
+ CERROR("master lmv inactive\n");
+ return -EIO;
+ }
+
+ if (oqctl->qc_cmd != Q_GETOQUOTA) {
+ rc = obd_quotactl(tgt->ltd_exp, oqctl);
+ return rc;
+ }
+
+ curspace = curinodes = 0;
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ int err;
+ tgt = lmv->tgts[i];
+
+ if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
+ continue;
+ if (!tgt->ltd_active) {
+ CDEBUG(D_HA, "mdt %d is inactive.\n", i);
+ continue;
+ }
+
+ err = obd_quotactl(tgt->ltd_exp, oqctl);
+ if (err) {
+ CERROR("getquota on mdt %d failed. %d\n", i, err);
+ if (!rc)
+ rc = err;
+ } else {
+ curspace += oqctl->qc_dqblk.dqb_curspace;
+ curinodes += oqctl->qc_dqblk.dqb_curinodes;
+ }
+ }
+ oqctl->qc_dqblk.dqb_curspace = curspace;
+ oqctl->qc_dqblk.dqb_curinodes = curinodes;
+
+ return rc;
+}
+
+static int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int i, rc = 0;
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ int err;
+ tgt = lmv->tgts[i];
+ if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) {
+ CERROR("lmv idx %d inactive\n", i);
+ return -EIO;
+ }
+
+ err = obd_quotacheck(tgt->ltd_exp, oqctl);
+ if (err && !rc)
+ rc = err;
+ }
+
+ return rc;
+}
+
+static struct obd_ops lmv_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_setup = lmv_setup,
+ .o_cleanup = lmv_cleanup,
+ .o_precleanup = lmv_precleanup,
+ .o_process_config = lmv_process_config,
+ .o_connect = lmv_connect,
+ .o_disconnect = lmv_disconnect,
+ .o_statfs = lmv_statfs,
+ .o_get_info = lmv_get_info,
+ .o_set_info_async = lmv_set_info_async,
+ .o_packmd = lmv_packmd,
+ .o_unpackmd = lmv_unpackmd,
+ .o_notify = lmv_notify,
+ .o_get_uuid = lmv_get_uuid,
+ .o_iocontrol = lmv_iocontrol,
+ .o_quotacheck = lmv_quotacheck,
+ .o_quotactl = lmv_quotactl
+};
+
+static struct md_ops lmv_md_ops = {
+ .m_getstatus = lmv_getstatus,
+ .m_null_inode = lmv_null_inode,
+ .m_find_cbdata = lmv_find_cbdata,
+ .m_close = lmv_close,
+ .m_create = lmv_create,
+ .m_done_writing = lmv_done_writing,
+ .m_enqueue = lmv_enqueue,
+ .m_getattr = lmv_getattr,
+ .m_getxattr = lmv_getxattr,
+ .m_getattr_name = lmv_getattr_name,
+ .m_intent_lock = lmv_intent_lock,
+ .m_link = lmv_link,
+ .m_rename = lmv_rename,
+ .m_setattr = lmv_setattr,
+ .m_setxattr = lmv_setxattr,
+ .m_sync = lmv_sync,
+ .m_readpage = lmv_readpage,
+ .m_unlink = lmv_unlink,
+ .m_init_ea_size = lmv_init_ea_size,
+ .m_cancel_unused = lmv_cancel_unused,
+ .m_set_lock_data = lmv_set_lock_data,
+ .m_lock_match = lmv_lock_match,
+ .m_get_lustre_md = lmv_get_lustre_md,
+ .m_free_lustre_md = lmv_free_lustre_md,
+ .m_set_open_replay_data = lmv_set_open_replay_data,
+ .m_clear_open_replay_data = lmv_clear_open_replay_data,
+ .m_renew_capa = lmv_renew_capa,
+ .m_unpack_capa = lmv_unpack_capa,
+ .m_get_remote_perm = lmv_get_remote_perm,
+ .m_intent_getattr_async = lmv_intent_getattr_async,
+ .m_revalidate_lock = lmv_revalidate_lock
+};
+
+static int __init lmv_init(void)
+{
+ struct lprocfs_static_vars lvars;
+ int rc;
+
+ lprocfs_lmv_init_vars(&lvars);
+
+ rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
+ lvars.module_vars, LUSTRE_LMV_NAME, NULL);
+ return rc;
+}
+
+static void lmv_exit(void)
+{
+ class_unregister_type(LUSTRE_LMV_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+module_init(lmv_init);
+module_exit(lmv_exit);
diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
new file mode 100644
index 000000000..22e5c315f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
@@ -0,0 +1,237 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include "../include/lprocfs_status.h"
+#include "../include/obd_class.h"
+#include "lmv_internal.h"
+
+static int lmv_numobd_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lmv_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lmv.desc;
+ seq_printf(m, "%u\n", desc->ld_tgt_count);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lmv_numobd);
+
+static const char *placement_name[] = {
+ [PLACEMENT_CHAR_POLICY] = "CHAR",
+ [PLACEMENT_NID_POLICY] = "NID",
+ [PLACEMENT_INVAL_POLICY] = "INVAL"
+};
+
+static enum placement_policy placement_name2policy(char *name, int len)
+{
+ int i;
+
+ for (i = 0; i < PLACEMENT_MAX_POLICY; i++) {
+ if (!strncmp(placement_name[i], name, len))
+ return i;
+ }
+ return PLACEMENT_INVAL_POLICY;
+}
+
+static const char *placement_policy2name(enum placement_policy placement)
+{
+ LASSERT(placement < PLACEMENT_MAX_POLICY);
+ return placement_name[placement];
+}
+
+static int lmv_placement_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lmv_obd *lmv;
+
+ LASSERT(dev != NULL);
+ lmv = &dev->u.lmv;
+ seq_printf(m, "%s\n", placement_policy2name(lmv->lmv_placement));
+ return 0;
+}
+
+#define MAX_POLICY_STRING_SIZE 64
+
+static ssize_t lmv_placement_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ char dummy[MAX_POLICY_STRING_SIZE + 1];
+ int len = count;
+ enum placement_policy policy;
+ struct lmv_obd *lmv;
+
+ if (copy_from_user(dummy, buffer, MAX_POLICY_STRING_SIZE))
+ return -EFAULT;
+
+ LASSERT(dev != NULL);
+ lmv = &dev->u.lmv;
+
+ if (len > MAX_POLICY_STRING_SIZE)
+ len = MAX_POLICY_STRING_SIZE;
+
+ if (dummy[len - 1] == '\n')
+ len--;
+ dummy[len] = '\0';
+
+ policy = placement_name2policy(dummy, len);
+ if (policy != PLACEMENT_INVAL_POLICY) {
+ spin_lock(&lmv->lmv_lock);
+ lmv->lmv_placement = policy;
+ spin_unlock(&lmv->lmv_lock);
+ } else {
+ CERROR("Invalid placement policy \"%s\"!\n", dummy);
+ return -EINVAL;
+ }
+ return count;
+}
+LPROC_SEQ_FOPS(lmv_placement);
+
+static int lmv_activeobd_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lmv_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lmv.desc;
+ seq_printf(m, "%u\n", desc->ld_active_tgt_count);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lmv_activeobd);
+
+static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lmv_obd *lmv;
+
+ LASSERT(dev != NULL);
+ lmv = &dev->u.lmv;
+ seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lmv_desc_uuid);
+
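+/* seq_file iterator walking the LMV target table, one target per line. */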
+static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+ struct obd_device *dev = p->private;
+ struct lmv_obd *lmv = &dev->u.lmv;
+ return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos];
+}
+
+static void lmv_tgt_seq_stop(struct seq_file *p, void *v)
+{
+ return;
+}
+
+static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct obd_device *dev = p->private;
+ struct lmv_obd *lmv = &dev->u.lmv;
+ ++*pos;
+ return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos];
+}
+
+static int lmv_tgt_seq_show(struct seq_file *p, void *v)
+{
+ struct lmv_tgt_desc *tgt = v;
+
+ if (tgt == NULL)
+ return 0;
+ seq_printf(p, "%d: %s %sACTIVE\n",
+ tgt->ltd_idx, tgt->ltd_uuid.uuid,
+ tgt->ltd_active ? "" : "IN");
+ return 0;
+}
+
+static struct seq_operations lmv_tgt_sops = {
+ .start = lmv_tgt_seq_start,
+ .stop = lmv_tgt_seq_stop,
+ .next = lmv_tgt_seq_next,
+ .show = lmv_tgt_seq_show,
+};
+
+static int lmv_target_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc;
+
+ rc = seq_open(file, &lmv_tgt_sops);
+ if (rc)
+ return rc;
+
+ seq = file->private_data;
+ seq->private = PDE_DATA(inode);
+
+ return 0;
+}
+
+LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid);
+
+static struct lprocfs_vars lprocfs_lmv_obd_vars[] = {
+ { "numobd", &lmv_numobd_fops, NULL, 0 },
+ { "placement", &lmv_placement_fops, NULL, 0 },
+ { "activeobd", &lmv_activeobd_fops, NULL, 0 },
+ { "uuid", &lmv_uuid_fops, NULL, 0 },
+ { "desc_uuid", &lmv_desc_uuid_fops, NULL, 0 },
+ { NULL }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(lmv, numrefs);
+
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = {
+ { "num_refs", &lmv_numrefs_fops, NULL, 0 },
+ { NULL }
+};
+
+struct file_operations lmv_proc_target_fops = {
+ .owner = THIS_MODULE,
+ .open = lmv_target_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = lprocfs_lmv_module_vars;
+ lvars->obd_vars = lprocfs_lmv_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile
new file mode 100644
index 000000000..6fe56a24b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LUSTRE_FS) += lov.o
+lov-y := lov_obd.o lov_pack.o lov_offset.o lov_merge.o \
+ lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o \
+ lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o \
+ lovsub_lock.o lovsub_io.o lov_pool.o
+lov-$(CONFIG_PROC_FS) += lproc_lov.o
diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
new file mode 100644
index 000000000..314ce8525
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
@@ -0,0 +1,839 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of LOV layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#ifndef LOV_CL_INTERNAL_H
+#define LOV_CL_INTERNAL_H
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd.h"
+#include "../include/cl_object.h"
+#include "lov_internal.h"
+
+/** \defgroup lov lov
+ * Logical object volume layer. This layer implements data striping (raid0).
+ *
+ * At the lov layer top-entity (object, page, lock, io) is connected to one or
+ * more sub-entities: top-object, representing a file is connected to a set of
+ * sub-objects, each representing a stripe, file-level top-lock is connected
+ * to a set of per-stripe sub-locks, top-page is connected to a (single)
+ * sub-page, and a top-level IO is connected to a set of (potentially
+ * concurrent) sub-IO's.
+ *
+ * Sub-object, sub-page, and sub-io have a well-defined top-object, top-page,
+ * and top-io respectively, while a single sub-lock can be part of multiple
+ * top-locks.
+ *
+ * Reference counting models are different for different types of entities:
+ *
+ * - top-object keeps a reference to its sub-objects, and destroys them
+ * when it is destroyed.
+ *
+ * - top-page keeps a reference to its sub-page, and destroys it when it
+ * is destroyed.
+ *
+ * - sub-lock keeps a reference to its top-locks. Top-lock keeps a
+ * reference (and a hold, see cl_lock_hold()) on its sub-locks when it
+ * is actively using them (that is, in cl_lock_state::CLS_QUEUING,
+ * cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When
+ * moving into cl_lock_state::CLS_CACHED state, top-lock releases a
+ * hold. From this moment top-lock has only a 'weak' reference to its
+ * sub-locks. This reference is protected by top-lock
+ * cl_lock::cll_guard, and will be automatically cleared by the sub-lock
+ * when the latter is destroyed. When a sub-lock is canceled, a
+ * reference to it is removed from the top-lock array, and top-lock is
+ * moved into CLS_NEW state. It is guaranteed that all sub-locks exist
+ * while their top-lock is in CLS_HELD or CLS_CACHED states.
+ *
+ * - IO's are not reference counted.
+ *
+ * To implement a connection between top and sub entities, lov layer is split
+ * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
+ * implementing full set of cl-interfaces. For example, top-object has vvp and
+ * lov layers, and its sub-object has lovsub and osc layers. The lovsub layer
+ * is used to track the child-parent relationship.
+ *
+ * @{
+ */
+
+struct lovsub_device;
+struct lovsub_object;
+struct lovsub_lock;
+
+enum lov_device_flags {
+ LOV_DEV_INITIALIZED = 1 << 0
+};
+
+/*
+ * Upper half.
+ */
+
+/**
+ * Resources that are used in the memory-cleaning path, and whose allocation
+ * cannot fail even when memory is tight. They are preallocated in sufficient
+ * quantities in lov_device::ld_emerg[], and access to them is serialized by
+ * lov_device::ld_mutex.
+ */
+struct lov_device_emerg {
+ /**
+ * Page list used to submit IO when memory is under pressure.
+ */
+ struct cl_page_list emrg_page_list;
+ /**
+ * Sub-io shared by all threads accessing this device when memory is
+ * too low to allocate sub-io's dynamically.
+ */
+ struct cl_io emrg_subio;
+ /**
+ * Environment used by the sub-io in
+ * lov_device_emerg::emrg_subio.
+ */
+ struct lu_env *emrg_env;
+ /**
+ * Refchecks for lov_device_emerg::emrg_env.
+ *
+ * \see cl_env_get()
+ */
+ int emrg_refcheck;
+};
+
+struct lov_device {
+ /*
+ * XXX Locking of lov-private data is missing.
+ */
+ struct cl_device ld_cl;
+ struct lov_obd *ld_lov;
+ /** size of lov_device::ld_target[] array */
+ __u32 ld_target_nr;
+ struct lovsub_device **ld_target;
+ __u32 ld_flags;
+
+ /** Emergency resources used in memory-cleansing paths. */
+ struct lov_device_emerg **ld_emrg;
+ /**
+ * Serializes access to lov_device::ld_emrg in low-memory
+ * conditions.
+ */
+ struct mutex ld_mutex;
+};
+
+/**
+ * Layout type.
+ */
+enum lov_layout_type {
+ LLT_EMPTY, /** empty file without body (mknod + truncate) */
+ LLT_RAID0, /** striped file */
+ LLT_RELEASED, /** file with no objects (data in HSM) */
+ LLT_NR
+};
+
+static inline char *llt2str(enum lov_layout_type llt)
+{
+ switch (llt) {
+ case LLT_EMPTY:
+ return "EMPTY";
+ case LLT_RAID0:
+ return "RAID0";
+ case LLT_RELEASED:
+ return "RELEASED";
+ case LLT_NR:
+ LBUG();
+ }
+ LBUG();
+ return "";
+}
+
+/**
+ * lov-specific file state.
+ *
+ * lov object has particular layout type, determining how top-object is built
+ * on top of sub-objects. Layout type can change dynamically. When this
+ * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode,
+ * all state pertaining to the old layout type is destroyed, and new state is
+ * constructed. All object methods take said semaphore in the shared mode,
+ * providing serialization against transition between layout types.
+ *
+ * To avoid multiple `if' or `switch' statements, selecting behavior for the
+ * current layout type, object methods perform double-dispatch, invoking
+ * function corresponding to the current layout type.
+ */
+struct lov_object {
+ struct cl_object lo_cl;
+ /**
+ * Serializes object operations with transitions between layout types.
+ *
+ * This semaphore is taken in shared mode by all object methods, and
+ * is taken in exclusive mode when object type is changed.
+ *
+ * \see lov_object::lo_type
+ */
+ struct rw_semaphore lo_type_guard;
+ /**
+ * Type of an object. Protected by lov_object::lo_type_guard.
+ */
+ enum lov_layout_type lo_type;
+ /**
+ * True if layout is invalid. This bit is cleared when layout lock
+ * is lost.
+ */
+ bool lo_layout_invalid;
+ /**
+ * How many IOs are ongoing on this object. Layout can be changed
+ * only if there is no active IO.
+ */
+ atomic_t lo_active_ios;
+ /**
+ * Waitq - wait until no one else is using lo_lsm
+ */
+ wait_queue_head_t lo_waitq;
+ /**
+ * Layout metadata. NULL if empty layout.
+ */
+ struct lov_stripe_md *lo_lsm;
+
+ union lov_layout_state {
+ struct lov_layout_raid0 {
+ unsigned lo_nr;
+ /**
+ * When this is true, lov_object::lo_attr contains
+ * valid, up-to-date attributes for a top-level
+ * object. This field is reset to 0 when attributes of
+ * any sub-object change.
+ */
+ int lo_attr_valid;
+ /**
+ * Array of sub-objects. Allocated when top-object is
+ * created (lov_init_raid0()).
+ *
+ * Top-object is a strict master of its sub-objects:
+ * it is created before them, and outlives its
+ * children (the latter is necessary so that basic
+ * functions like cl_object_top() always
+ * work). Top-object keeps a reference on every
+ * sub-object.
+ *
+ * When top-object is destroyed (lov_delete_raid0())
+ * it releases its reference to a sub-object and waits
+ * until the latter is finally destroyed.
+ */
+ struct lovsub_object **lo_sub;
+ /**
+ * protect lo_sub
+ */
+ spinlock_t lo_sub_lock;
+ /**
+ * Cached object attribute, built from sub-object
+ * attributes.
+ */
+ struct cl_attr lo_attr;
+ } raid0;
+ struct lov_layout_state_empty {
+ } empty;
+ struct lov_layout_state_released {
+ } released;
+ } u;
+ /**
+ * Thread that acquired lov_object::lo_type_guard in an exclusive
+ * mode.
+ */
+ struct task_struct *lo_owner;
+};
+
+/**
+ * Flags that top-lock can set on each of its sub-locks.
+ */
+enum lov_sub_flags {
+ /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */
+ LSF_HELD = 1 << 0
+};
+
+/**
+ * State lov_lock keeps for each sub-lock.
+ */
+struct lov_lock_sub {
+ /** sub-lock itself */
+ struct lovsub_lock *sub_lock;
+ /** An array of per-sub-lock flags, taken from enum lov_sub_flags */
+ unsigned sub_flags;
+ int sub_stripe;
+ struct cl_lock_descr sub_descr;
+ struct cl_lock_descr sub_got;
+};
+
+/**
+ * lov-specific lock state.
+ */
+struct lov_lock {
+ struct cl_lock_slice lls_cl;
+ /** Number of sub-locks in this lock */
+ int lls_nr;
+ /**
+ * Number of existing sub-locks.
+ */
+ unsigned lls_nr_filled;
+ /**
+ * Set when a sub-lock was canceled while the top-lock was being
+ * used, or was unused.
+ */
+ unsigned int lls_cancel_race:1;
+ /**
+ * An array of sub-locks
+ *
+ * There are two issues with managing sub-locks:
+ *
+ * - sub-locks are concurrently canceled, and
+ *
+ * - sub-locks are shared with other top-locks.
+ *
+ * To manage cancellation, top-lock acquires a hold on a sublock
+ * (lov_sublock_adopt()) when the latter is inserted into
+ * lov_lock::lls_sub[]. This hold is released (lov_sublock_release())
+ * when top-lock is going into CLS_CACHED state or destroyed. Hold
+ * prevents sub-lock from cancellation.
+ *
+ * Sub-lock sharing means, among other things, that a top-lock that is
+ * in the process of creation (i.e., not yet inserted into the lock list)
+ * is already accessible to other threads once at least one of its
+ * sub-locks is created, see lov_lock_sub_init().
+ *
+ * Sub-lock can be in one of the following states:
+ *
+ * - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such
+ * sub-lock was either never created (top-lock is in CLS_NEW
+ * state), or it was created, then canceled, then destroyed
+ * (lov_lock_unlink() cleared sub-lock pointer in the top-lock).
+ *
+ * - sub-lock exists and is on
+ * hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a
+ * normal state of a sub-lock in CLS_HELD and CLS_CACHED states
+ * of a top-lock.
+ *
+ * - sub-lock exists, but is not held by the top-lock. This
+ * happens after top-lock released a hold on sub-locks before
+ * going into cache (lov_lock_unuse()).
+ *
+ * \todo To support wide-striping, array has to be replaced with a set
+ * of queues to avoid scanning.
+ */
+ struct lov_lock_sub *lls_sub;
+ /**
+ * Original description with which lock was enqueued.
+ */
+ struct cl_lock_descr lls_orig;
+};
+
+struct lov_page {
+ struct cl_page_slice lps_cl;
+ int lps_invalid;
+};
+
+/*
+ * Bottom half.
+ */
+
+struct lovsub_device {
+ struct cl_device acid_cl;
+ struct lov_device *acid_super;
+ int acid_idx;
+ struct cl_device *acid_next;
+};
+
+struct lovsub_object {
+ struct cl_object_header lso_header;
+ struct cl_object lso_cl;
+ struct lov_object *lso_super;
+ int lso_index;
+};
+
+/**
+ * A link between a top-lock and a sub-lock. A separate data structure is
+ * necessary because top-locks and sub-locks are in an M:N relationship.
+ *
+ * \todo This can be optimized for the (by far) most frequent case of a single
+ * top-lock per sub-lock.
+ */
+struct lov_lock_link {
+ struct lov_lock *lll_super;
+ /** An index within parent lock. */
+ int lll_idx;
+ /**
+ * A linkage into per sub-lock list of all corresponding top-locks,
+ * hanging off lovsub_lock::lss_parents.
+ */
+ struct list_head lll_list;
+};
+
+/**
+ * Lock state at lovsub layer.
+ */
+struct lovsub_lock {
+ struct cl_lock_slice lss_cl;
+ /**
+ * List of top-locks that have given sub-lock as their part. Protected
+ * by cl_lock::cll_guard mutex.
+ */
+ struct list_head lss_parents;
+ /**
+ * Top-lock that initiated current operation on this sub-lock. This is
+ * only set during top-to-bottom lock operations like enqueue, and is
+ * used to optimize state change notification. Protected by
+ * cl_lock::cll_guard mutex.
+ *
+ * \see lovsub_lock_state_one().
+ */
+ struct cl_lock *lss_active;
+};
+
+/**
+ * Describe the environment settings for sublocks.
+ */
+struct lov_sublock_env {
+ const struct lu_env *lse_env;
+ struct cl_io *lse_io;
+ struct lov_io_sub *lse_sub;
+};
+
+struct lovsub_page {
+ struct cl_page_slice lsb_cl;
+};
+
+
+struct lov_thread_info {
+ struct cl_object_conf lti_stripe_conf;
+ struct lu_fid lti_fid;
+ struct cl_lock_descr lti_ldescr;
+ struct ost_lvb lti_lvb;
+ struct cl_2queue lti_cl2q;
+ struct cl_lock_closure lti_closure;
+ wait_queue_t lti_waiter;
+};
+
+/**
+ * State that lov_io maintains for every sub-io.
+ */
+struct lov_io_sub {
+ int sub_stripe;
+ /**
+ * sub-io for a stripe. Ideally sub-io's can be stopped and resumed
+ * independently, with lov acting as a scheduler to maximize overall
+ * throughput.
+ */
+ struct cl_io *sub_io;
+ /**
+ * Linkage into a list (hanging off lov_io::lis_active) of all
+ * sub-io's active for the current IO iteration.
+ */
+ struct list_head sub_linkage;
+ /**
+ * true, iff cl_io_init() was successfully executed against
+ * lov_io_sub::sub_io.
+ */
+ int sub_io_initialized;
+ /**
+ * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't
+ * allocated, but borrowed from a per-device emergency pool.
+ */
+ int sub_borrowed;
+ /**
+ * environment, in which sub-io executes.
+ */
+ struct lu_env *sub_env;
+ /**
+ * environment's refcheck.
+ *
+ * \see cl_env_get()
+ */
+ int sub_refcheck;
+ int sub_refcheck2;
+ int sub_reenter;
+ void *sub_cookie;
+};
+
+/**
+ * IO state private for LOV.
+ */
+struct lov_io {
+ /** super-class */
+ struct cl_io_slice lis_cl;
+ /**
+ * Pointer to the object slice. This is a duplicate of
+ * lov_io::lis_cl::cis_object.
+ */
+ struct lov_object *lis_object;
+ /**
+ * Original end-of-io position for this IO, set by the upper layer as
+ * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this,
+ * changes pos and count to fit IO into a single stripe and uses saved
+ * value to determine when IO iterations have to stop.
+ *
+ * This is used only for CIT_READ and CIT_WRITE io's.
+ */
+ loff_t lis_io_endpos;
+
+ /**
+ * starting position within a file, for the current io loop iteration
+ * (stripe), used by ci_io_loop().
+ */
+ u64 lis_pos;
+ /**
+ * end position within a file, for the current stripe io. This is
+ * exclusive (i.e., next offset after last byte affected by io).
+ */
+ u64 lis_endpos;
+
+ int lis_mem_frozen;
+ int lis_stripe_count;
+ int lis_active_subios;
+
+ /**
+ * the index of lis_single_subio in the lis_subs array
+ */
+ int lis_single_subio_index;
+ struct cl_io lis_single_subio;
+
+ /**
+ * size of the lis_subs array, actually the highest stripe #
+ */
+ int lis_nr_subios;
+ struct lov_io_sub *lis_subs;
+ /**
+ * List of active sub-io's.
+ */
+ struct list_head lis_active;
+};
+
+struct lov_session {
+ struct lov_io ls_io;
+ struct lov_sublock_env ls_subenv;
+};
+
+/**
+ * State of transfer for lov.
+ */
+struct lov_req {
+ struct cl_req_slice lr_cl;
+};
+
+/**
+ * State of transfer for lovsub.
+ */
+struct lovsub_req {
+ struct cl_req_slice lsrq_cl;
+};
+
+extern struct lu_device_type lov_device_type;
+extern struct lu_device_type lovsub_device_type;
+
+extern struct lu_context_key lov_key;
+extern struct lu_context_key lov_session_key;
+
+extern struct kmem_cache *lov_lock_kmem;
+extern struct kmem_cache *lov_object_kmem;
+extern struct kmem_cache *lov_thread_kmem;
+extern struct kmem_cache *lov_session_kmem;
+extern struct kmem_cache *lov_req_kmem;
+
+extern struct kmem_cache *lovsub_lock_kmem;
+extern struct kmem_cache *lovsub_object_kmem;
+extern struct kmem_cache *lovsub_req_kmem;
+
+extern struct kmem_cache *lov_lock_link_kmem;
+
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf);
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf);
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io);
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io);
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io);
+
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io);
+int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io);
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io);
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io);
+int lov_io_init_released(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io);
+void lov_lock_unlink(const struct lu_env *env, struct lov_lock_link *link,
+ struct lovsub_lock *sub);
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio,
+ int stripe);
+void lov_sub_put(struct lov_io_sub *sub);
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+ struct lovsub_lock *sublock,
+ const struct cl_lock_descr *d, int idx);
+
+
+int lov_page_init(const struct lu_env *env, struct cl_object *ob,
+ struct cl_page *page, struct page *vmpage);
+int lovsub_page_init(const struct lu_env *env, struct cl_object *ob,
+ struct cl_page *page, struct page *vmpage);
+
+int lov_page_init_empty(const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+int lov_page_init_raid0(const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev);
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev);
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+ struct lov_lock *lck,
+ struct lovsub_lock *sub);
+struct lov_io_sub *lov_page_subio(const struct lu_env *env,
+ struct lov_io *lio,
+ const struct cl_page_slice *slice);
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm);
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
+
+#define lov_foreach_target(lov, var) \
+ for (var = 0; var < lov_targets_nr(lov); ++var)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct lov_session *lov_env_session(const struct lu_env *env)
+{
+ struct lov_session *ses;
+
+ ses = lu_context_key_get(env->le_ses, &lov_session_key);
+ LASSERT(ses != NULL);
+ return ses;
+}
+
+static inline struct lov_io *lov_env_io(const struct lu_env *env)
+{
+ return &lov_env_session(env)->ls_io;
+}
+
+static inline int lov_is_object(const struct lu_object *obj)
+{
+ return obj->lo_dev->ld_type == &lov_device_type;
+}
+
+static inline int lovsub_is_object(const struct lu_object *obj)
+{
+ return obj->lo_dev->ld_type == &lovsub_device_type;
+}
+
+static inline struct lu_device *lov2lu_dev(struct lov_device *lov)
+{
+ return &lov->ld_cl.cd_lu_dev;
+}
+
+static inline struct lov_device *lu2lov_dev(const struct lu_device *d)
+{
+ LINVRNT(d->ld_type == &lov_device_type);
+ return container_of0(d, struct lov_device, ld_cl.cd_lu_dev);
+}
+
+static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub)
+{
+ return &lovsub->acid_cl;
+}
+
+static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub)
+{
+ return &lovsub2cl_dev(lovsub)->cd_lu_dev;
+}
+
+static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d)
+{
+ LINVRNT(d->ld_type == &lovsub_device_type);
+ return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev);
+}
+
+static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d)
+{
+ LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type);
+ return container_of0(d, struct lovsub_device, acid_cl);
+}
+
+static inline struct lu_object *lov2lu(struct lov_object *lov)
+{
+ return &lov->lo_cl.co_lu;
+}
+
+static inline struct cl_object *lov2cl(struct lov_object *lov)
+{
+ return &lov->lo_cl;
+}
+
+static inline struct lov_object *lu2lov(const struct lu_object *obj)
+{
+ LINVRNT(lov_is_object(obj));
+ return container_of0(obj, struct lov_object, lo_cl.co_lu);
+}
+
+static inline struct lov_object *cl2lov(const struct cl_object *obj)
+{
+ LINVRNT(lov_is_object(&obj->co_lu));
+ return container_of0(obj, struct lov_object, lo_cl);
+}
+
+static inline struct lu_object *lovsub2lu(struct lovsub_object *los)
+{
+ return &los->lso_cl.co_lu;
+}
+
+static inline struct cl_object *lovsub2cl(struct lovsub_object *los)
+{
+ return &los->lso_cl;
+}
+
+static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj)
+{
+ LINVRNT(lovsub_is_object(&obj->co_lu));
+ return container_of0(obj, struct lovsub_object, lso_cl);
+}
+
+static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj)
+{
+ LINVRNT(lovsub_is_object(obj));
+ return container_of0(obj, struct lovsub_object, lso_cl.co_lu);
+}
+
+static inline struct lovsub_lock *
+cl2lovsub_lock(const struct cl_lock_slice *slice)
+{
+ LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu));
+ return container_of(slice, struct lovsub_lock, lss_cl);
+}
+
+static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+
+ slice = cl_lock_at(lock, &lovsub_device_type);
+ LASSERT(slice != NULL);
+ return cl2lovsub_lock(slice);
+}
+
+static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice)
+{
+ LINVRNT(lov_is_object(&slice->cls_obj->co_lu));
+ return container_of(slice, struct lov_lock, lls_cl);
+}
+
+static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice)
+{
+ LINVRNT(lov_is_object(&slice->cpl_obj->co_lu));
+ return container_of0(slice, struct lov_page, lps_cl);
+}
+
+static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice)
+{
+ return container_of0(slice, struct lov_req, lr_cl);
+}
+
+static inline struct lovsub_page *
+cl2lovsub_page(const struct cl_page_slice *slice)
+{
+ LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu));
+ return container_of0(slice, struct lovsub_page, lsb_cl);
+}
+
+static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice)
+{
+ return container_of0(slice, struct lovsub_req, lsrq_cl);
+}
+
+static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice)
+{
+ return slice->cpl_page->cp_child;
+}
+
+static inline struct lov_io *cl2lov_io(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_io *lio;
+
+ lio = container_of(ios, struct lov_io, lis_cl);
+ LASSERT(lio == lov_env_io(env));
+ return lio;
+}
+
+static inline int lov_targets_nr(const struct lov_device *lov)
+{
+ return lov->ld_lov->desc.ld_tgt_count;
+}
+
+static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
+{
+ struct lov_thread_info *info;
+
+ info = lu_context_key_get(&env->le_ctx, &lov_key);
+ LASSERT(info != NULL);
+ return info;
+}
+
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov)
+{
+ LASSERT(lov->lo_type == LLT_RAID0);
+ LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC ||
+ lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3);
+ return &lov->u.raid0;
+}
+
+/** @} lov */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c
new file mode 100644
index 000000000..711b837dd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_dev.c
@@ -0,0 +1,528 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOV layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+/* class_name2obd() */
+#include "../include/obd_class.h"
+
+#include "lov_cl_internal.h"
+#include "lov_internal.h"
+
+
+struct kmem_cache *lov_lock_kmem;
+struct kmem_cache *lov_object_kmem;
+struct kmem_cache *lov_thread_kmem;
+struct kmem_cache *lov_session_kmem;
+struct kmem_cache *lov_req_kmem;
+
+struct kmem_cache *lovsub_lock_kmem;
+struct kmem_cache *lovsub_object_kmem;
+struct kmem_cache *lovsub_req_kmem;
+
+struct kmem_cache *lov_lock_link_kmem;
+
+/** Lock class of lov_device::ld_mutex. */
+static struct lock_class_key cl_lov_device_mutex_class;
+
+struct lu_kmem_descr lov_caches[] = {
+ {
+ .ckd_cache = &lov_lock_kmem,
+ .ckd_name = "lov_lock_kmem",
+ .ckd_size = sizeof(struct lov_lock)
+ },
+ {
+ .ckd_cache = &lov_object_kmem,
+ .ckd_name = "lov_object_kmem",
+ .ckd_size = sizeof(struct lov_object)
+ },
+ {
+ .ckd_cache = &lov_thread_kmem,
+ .ckd_name = "lov_thread_kmem",
+ .ckd_size = sizeof(struct lov_thread_info)
+ },
+ {
+ .ckd_cache = &lov_session_kmem,
+ .ckd_name = "lov_session_kmem",
+ .ckd_size = sizeof(struct lov_session)
+ },
+ {
+ .ckd_cache = &lov_req_kmem,
+ .ckd_name = "lov_req_kmem",
+ .ckd_size = sizeof(struct lov_req)
+ },
+ {
+ .ckd_cache = &lovsub_lock_kmem,
+ .ckd_name = "lovsub_lock_kmem",
+ .ckd_size = sizeof(struct lovsub_lock)
+ },
+ {
+ .ckd_cache = &lovsub_object_kmem,
+ .ckd_name = "lovsub_object_kmem",
+ .ckd_size = sizeof(struct lovsub_object)
+ },
+ {
+ .ckd_cache = &lovsub_req_kmem,
+ .ckd_name = "lovsub_req_kmem",
+ .ckd_size = sizeof(struct lovsub_req)
+ },
+ {
+ .ckd_cache = &lov_lock_link_kmem,
+ .ckd_name = "lov_lock_link_kmem",
+ .ckd_size = sizeof(struct lov_lock_link)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+/*****************************************************************************
+ *
+ * Lov transfer operations.
+ *
+ */
+
+static void lov_req_completion(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret)
+{
+ struct lov_req *lr;
+
+ lr = cl2lov_req(slice);
+ OBD_SLAB_FREE_PTR(lr, lov_req_kmem);
+}
+
+static const struct cl_req_operations lov_req_ops = {
+ .cro_completion = lov_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov device and device type functions.
+ *
+ */
+
+static void *lov_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct lov_thread_info *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, GFP_NOFS);
+ if (info != NULL)
+ INIT_LIST_HEAD(&info->lti_closure.clc_list);
+ else
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+static void lov_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct lov_thread_info *info = data;
+ LINVRNT(list_empty(&info->lti_closure.clc_list));
+ OBD_SLAB_FREE_PTR(info, lov_thread_kmem);
+}
+
+struct lu_context_key lov_key = {
+ .lct_tags = LCT_CL_THREAD,
+ .lct_init = lov_key_init,
+ .lct_fini = lov_key_fini
+};
+
+static void *lov_session_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct lov_session *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, GFP_NOFS);
+ if (info == NULL)
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+static void lov_session_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct lov_session *info = data;
+ OBD_SLAB_FREE_PTR(info, lov_session_kmem);
+}
+
+struct lu_context_key lov_session_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = lov_session_key_init,
+ .lct_fini = lov_session_key_fini
+};
+
+/* type constructor/destructor: lov_type_{init,fini,start,stop}() */
+LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key);
+
+static struct lu_device *lov_device_fini(const struct lu_env *env,
+ struct lu_device *d)
+{
+ int i;
+ struct lov_device *ld = lu2lov_dev(d);
+
+ LASSERT(ld->ld_lov != NULL);
+ if (ld->ld_target == NULL)
+ return NULL;
+
+ lov_foreach_target(ld, i) {
+ struct lovsub_device *lsd;
+
+ lsd = ld->ld_target[i];
+ if (lsd != NULL) {
+ cl_stack_fini(env, lovsub2cl_dev(lsd));
+ ld->ld_target[i] = NULL;
+ }
+ }
+ return NULL;
+}
+
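+/* Build a lovsub device stack on top of each configured LOV target. */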
+static int lov_device_init(const struct lu_env *env, struct lu_device *d,
+ const char *name, struct lu_device *next)
+{
+ struct lov_device *ld = lu2lov_dev(d);
+ int i;
+ int rc = 0;
+
+ LASSERT(d->ld_site != NULL);
+ if (ld->ld_target == NULL)
+ return rc;
+
+ lov_foreach_target(ld, i) {
+ struct lovsub_device *lsd;
+ struct cl_device *cl;
+ struct lov_tgt_desc *desc;
+
+ desc = ld->ld_lov->lov_tgts[i];
+ if (desc == NULL)
+ continue;
+
+ cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+ desc->ltd_obd->obd_lu_dev);
+ if (IS_ERR(cl)) {
+ rc = PTR_ERR(cl);
+ break;
+ }
+ lsd = cl2lovsub_dev(cl);
+ lsd->acid_idx = i;
+ lsd->acid_super = ld;
+ ld->ld_target[i] = lsd;
+ }
+
+ if (rc)
+ lov_device_fini(env, d);
+ else
+ ld->ld_flags |= LOV_DEV_INITIALIZED;
+
+ return rc;
+}
+
+static int lov_req_init(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req)
+{
+ struct lov_req *lr;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, GFP_NOFS);
+ if (lr != NULL) {
+ cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+static const struct cl_device_operations lov_cl_ops = {
+ .cdo_req_init = lov_req_init
+};
+
+static void lov_emerg_free(struct lov_device_emerg **emrg, int nr)
+{
+ int i;
+
+ for (i = 0; i < nr; ++i) {
+ struct lov_device_emerg *em;
+
+ em = emrg[i];
+ if (em != NULL) {
+ LASSERT(em->emrg_page_list.pl_nr == 0);
+ if (em->emrg_env != NULL)
+ cl_env_put(em->emrg_env, &em->emrg_refcheck);
+ OBD_FREE_PTR(em);
+ }
+ }
+ OBD_FREE(emrg, nr * sizeof(emrg[0]));
+}
+
+static struct lu_device *lov_device_free(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct lov_device *ld = lu2lov_dev(d);
+ const int nr = ld->ld_target_nr;
+
+ cl_device_fini(lu2cl_dev(d));
+ if (ld->ld_target != NULL)
+ OBD_FREE(ld->ld_target, nr * sizeof(ld->ld_target[0]));
+ if (ld->ld_emrg != NULL)
+ lov_emerg_free(ld->ld_emrg, nr);
+ OBD_FREE_PTR(ld);
+ return NULL;
+}
+
+static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev,
+ __u32 index)
+{
+ struct lov_device *ld = lu2lov_dev(dev);
+
+ if (ld->ld_target[index] != NULL) {
+ cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index]));
+ ld->ld_target[index] = NULL;
+ }
+}
+
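+/*
+ * Allocate per-target emergency resources (a page list and a pre-allocated
+ * cl environment) for lov_io_submit() to fall back on when it runs under
+ * memory pressure and cannot allocate anything new.
+ */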
+static struct lov_device_emerg **lov_emerg_alloc(int nr)
+{
+ struct lov_device_emerg **emerg;
+ int i;
+ int result;
+
+ OBD_ALLOC(emerg, nr * sizeof(emerg[0]));
+ if (emerg == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (result = i = 0; i < nr && result == 0; i++) {
+ struct lov_device_emerg *em;
+
+ OBD_ALLOC_PTR(em);
+ if (em != NULL) {
+ emerg[i] = em;
+ cl_page_list_init(&em->emrg_page_list);
+ em->emrg_env = cl_env_alloc(&em->emrg_refcheck,
+ LCT_REMEMBER|LCT_NOREF);
+ if (!IS_ERR(em->emrg_env))
+ em->emrg_env->le_ctx.lc_cookie = 0x2;
+ else {
+ result = PTR_ERR(em->emrg_env);
+ em->emrg_env = NULL;
+ }
+ } else
+ result = -ENOMEM;
+ }
+ if (result != 0) {
+ lov_emerg_free(emerg, nr);
+ emerg = ERR_PTR(result);
+ }
+ return emerg;
+}
+
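+/*
+ * Grow the ld_target array (and the matching emergency resources) up to the
+ * current lov_tgt_size, preserving any already-configured sub-devices.
+ */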
+static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev)
+{
+ int result;
+ __u32 tgt_size;
+ __u32 sub_size;
+
+ result = 0;
+ tgt_size = dev->ld_lov->lov_tgt_size;
+ sub_size = dev->ld_target_nr;
+ if (sub_size < tgt_size) {
+ struct lovsub_device **newd;
+ struct lov_device_emerg **emerg;
+ const size_t sz = sizeof(newd[0]);
+
+ emerg = lov_emerg_alloc(tgt_size);
+ if (IS_ERR(emerg))
+ return PTR_ERR(emerg);
+
+ OBD_ALLOC(newd, tgt_size * sz);
+ if (newd != NULL) {
+ mutex_lock(&dev->ld_mutex);
+ if (sub_size > 0) {
+ memcpy(newd, dev->ld_target, sub_size * sz);
+ OBD_FREE(dev->ld_target, sub_size * sz);
+ }
+ dev->ld_target = newd;
+ dev->ld_target_nr = tgt_size;
+
+ if (dev->ld_emrg != NULL)
+ lov_emerg_free(dev->ld_emrg, sub_size);
+ dev->ld_emrg = emerg;
+ mutex_unlock(&dev->ld_mutex);
+ } else {
+ lov_emerg_free(emerg, tgt_size);
+ result = -ENOMEM;
+ }
+ }
+ return result;
+}
+
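+/*
+ * Set up the cl stack for a newly added OST target and record the resulting
+ * lovsub device in the LOV device's target array.
+ */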
+static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
+ __u32 index)
+{
+ struct obd_device *obd = dev->ld_obd;
+ struct lov_device *ld = lu2lov_dev(dev);
+ struct lov_tgt_desc *tgt;
+ struct lovsub_device *lsd;
+ struct cl_device *cl;
+ int rc;
+
+ obd_getref(obd);
+
+ tgt = obd->u.lov.lov_tgts[index];
+ LASSERT(tgt != NULL);
+ LASSERT(tgt->ltd_obd != NULL);
+
+ if (!tgt->ltd_obd->obd_set_up) {
+ CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid));
+ return -EINVAL;
+ }
+
+ rc = lov_expand_targets(env, ld);
+ if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) {
+ LASSERT(dev->ld_site != NULL);
+
+ cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type,
+ tgt->ltd_obd->obd_lu_dev);
+ if (!IS_ERR(cl)) {
+ lsd = cl2lovsub_dev(cl);
+ lsd->acid_idx = index;
+ lsd->acid_super = ld;
+ ld->ld_target[index] = lsd;
+ } else {
+ CERROR("add failed (%d), deleting %s\n", rc,
+ obd_uuid2str(&tgt->ltd_uuid));
+ lov_cl_del_target(env, dev, index);
+ rc = PTR_ERR(cl);
+ }
+ }
+ obd_putref(obd);
+ return rc;
+}
+
+static int lov_process_config(const struct lu_env *env,
+ struct lu_device *d, struct lustre_cfg *cfg)
+{
+ struct obd_device *obd = d->ld_obd;
+ int cmd;
+ int rc;
+ int gen;
+ __u32 index;
+
+ obd_getref(obd);
+
+ cmd = cfg->lcfg_command;
+ rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen);
+ if (rc == 0) {
+ switch (cmd) {
+ case LCFG_LOV_ADD_OBD:
+ case LCFG_LOV_ADD_INA:
+ rc = lov_cl_add_target(env, d, index);
+ if (rc != 0)
+ lov_del_target(d->ld_obd, index, NULL, 0);
+ break;
+ case LCFG_LOV_DEL_OBD:
+ lov_cl_del_target(env, d, index);
+ break;
+ }
+ }
+ obd_putref(obd);
+ return rc;
+}
+
+static const struct lu_device_operations lov_lu_ops = {
+ .ldo_object_alloc = lov_object_alloc,
+ .ldo_process_config = lov_process_config,
+};
+
+static struct lu_device *lov_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg)
+{
+ struct lu_device *d;
+ struct lov_device *ld;
+ struct obd_device *obd;
+ int rc;
+
+ OBD_ALLOC_PTR(ld);
+ if (ld == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ cl_device_init(&ld->ld_cl, t);
+ d = lov2lu_dev(ld);
+ d->ld_ops = &lov_lu_ops;
+ ld->ld_cl.cd_ops = &lov_cl_ops;
+
+ mutex_init(&ld->ld_mutex);
+ lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);
+
+ /* setup the LOV OBD */
+ obd = class_name2obd(lustre_cfg_string(cfg, 0));
+ LASSERT(obd != NULL);
+ rc = lov_setup(obd, cfg);
+ if (rc) {
+ lov_device_free(env, d);
+ return ERR_PTR(rc);
+ }
+
+ ld->ld_lov = &obd->u.lov;
+ return d;
+}
+
+static const struct lu_device_type_operations lov_device_type_ops = {
+ .ldto_init = lov_type_init,
+ .ldto_fini = lov_type_fini,
+
+ .ldto_start = lov_type_start,
+ .ldto_stop = lov_type_stop,
+
+ .ldto_device_alloc = lov_device_alloc,
+ .ldto_device_free = lov_device_free,
+
+ .ldto_device_init = lov_device_init,
+ .ldto_device_fini = lov_device_fini
+};
+
+struct lu_device_type lov_device_type = {
+ .ldt_tags = LU_DEVICE_CL,
+ .ldt_name = LUSTRE_LOV_NAME,
+ .ldt_ops = &lov_device_type_ops,
+ .ldt_ctx_tags = LCT_CL_THREAD
+};
+EXPORT_SYMBOL(lov_device_type);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c
new file mode 100644
index 000000000..2bcfaeaff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_ea.c
@@ -0,0 +1,363 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_ea.c
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <asm/div64.h>
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_class.h"
+#include "../include/lustre/lustre_idl.h"
+
+#include "lov_internal.h"
+
+struct lovea_unpack_args {
+ struct lov_stripe_md *lsm;
+ int cursor;
+};
+
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+ __u16 stripe_count)
+{
+ if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+ CERROR("bad stripe count %d\n", stripe_count);
+ lov_dump_lmm_common(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ if (lmm_oi_id(&lmm->lmm_oi) == 0) {
+ CERROR("zero object id\n");
+ lov_dump_lmm_common(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) {
+ CERROR("bad striping pattern\n");
+ lov_dump_lmm_common(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ if (lmm->lmm_stripe_size == 0 ||
+ (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) {
+ CERROR("bad stripe size %u\n",
+ le32_to_cpu(lmm->lmm_stripe_size));
+ lov_dump_lmm_common(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size)
+{
+ struct lov_stripe_md *lsm;
+ struct lov_oinfo *loi;
+ int i, oinfo_ptrs_size;
+
+ LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT);
+
+ oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count;
+ *size = sizeof(struct lov_stripe_md) + oinfo_ptrs_size;
+
+ OBD_ALLOC_LARGE(lsm, *size);
+ if (!lsm)
+ return NULL;
+
+ for (i = 0; i < stripe_count; i++) {
+ OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS);
+ if (loi == NULL)
+ goto err;
+ lsm->lsm_oinfo[i] = loi;
+ }
+ lsm->lsm_stripe_count = stripe_count;
+ return lsm;
+
+err:
+ while (--i >= 0)
+ OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi));
+ OBD_FREE_LARGE(lsm, *size);
+ return NULL;
+}
+
+void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+ __u16 stripe_count = lsm->lsm_stripe_count;
+ int i;
+
+ for (i = 0; i < stripe_count; i++)
+ OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab,
+ sizeof(struct lov_oinfo));
+ OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) +
+ stripe_count * sizeof(struct lov_oinfo *));
+}
+
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm)
+{
+ /*
+ * This supposes that the first fields of lov_mds_md_v1/v3
+ * are the same.
+ */
+ lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi);
+ lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+ lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+ lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
+ lsm->lsm_pool_name[0] = '\0';
+}
+
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+ u64 *lov_off, u64 *swidth)
+{
+ if (swidth)
+ *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+ u64 *lov_off, u64 *swidth)
+{
+ if (swidth)
+ *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+ struct obd_export *md_exp)
+{
+ return 0;
+}
+
+/* Find minimum stripe maxbytes value. For inactive or
+ * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. */
+static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes)
+{
+ struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import;
+
+ if (imp == NULL || !tgt->ltd_active) {
+ *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+ return;
+ }
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_FULL &&
+ (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) &&
+ imp->imp_connect_data.ocd_maxbytes > 0) {
+ if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes)
+ *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+ } else {
+ *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+ }
+ spin_unlock(&imp->imp_lock);
+}
+
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
+ __u16 *stripe_count)
+{
+ if (lmm_bytes < sizeof(*lmm)) {
+ CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
+ lmm_bytes, (int)sizeof(*lmm));
+ return -EINVAL;
+ }
+
+ *stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+ if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
+ *stripe_count = 0;
+
+ if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+ CERROR("LOV EA V1 too small: %d, need %d\n",
+ lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
+ lov_dump_lmm_common(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+static int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md_v1 *lmm)
+{
+ struct lov_oinfo *loi;
+ int i;
+ int stripe_count;
+ __u64 stripe_maxbytes = OBD_OBJECT_EOF;
+
+ lsm_unpackmd_common(lsm, lmm);
+
+ stripe_count = lsm_is_released(lsm) ? 0 : lsm->lsm_stripe_count;
+
+ for (i = 0; i < stripe_count; i++) {
+ /* XXX LOV STACKING call down to osc_unpackmd() */
+ loi = lsm->lsm_oinfo[i];
+ ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+ loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+ loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+ CERROR("OST index %d more than OST count %d\n",
+ loi->loi_ost_idx, lov->desc.ld_tgt_count);
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ if (!lov->lov_tgts[loi->loi_ost_idx]) {
+ CERROR("OST index %d missing\n", loi->loi_ost_idx);
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ /* calculate the minimum stripe max bytes */
+ lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+ &stripe_maxbytes);
+ }
+
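+ /*
+ * A file can span all of its stripes, so the maximum file size is the
+ * smallest per-stripe limit times the stripe count (or the target
+ * count for a zero-stripe layout).
+ */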
+ lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+ if (lsm->lsm_stripe_count == 0)
+ lsm->lsm_maxbytes = stripe_maxbytes * lov->desc.ld_tgt_count;
+
+ return 0;
+}
+
+const struct lsm_operations lsm_v1_ops = {
+ .lsm_free = lsm_free_plain,
+ .lsm_destroy = lsm_destroy_plain,
+ .lsm_stripe_by_index = lsm_stripe_by_index_plain,
+ .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+ .lsm_lmm_verify = lsm_lmm_verify_v1,
+ .lsm_unpackmd = lsm_unpackmd_v1,
+};
+
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+ __u16 *stripe_count)
+{
+ struct lov_mds_md_v3 *lmm;
+
+ lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+ if (lmm_bytes < sizeof(*lmm)) {
+ CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+ lmm_bytes, (int)sizeof(*lmm));
+ return -EINVAL;
+ }
+
+ *stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+ if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
+ *stripe_count = 0;
+
+ if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+ CERROR("LOV EA V3 too small: %d, need %d\n",
+ lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+ lov_dump_lmm_common(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+ *stripe_count);
+}
+
+static int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmmv1)
+{
+ struct lov_mds_md_v3 *lmm;
+ struct lov_oinfo *loi;
+ int i;
+ int stripe_count;
+ __u64 stripe_maxbytes = OBD_OBJECT_EOF;
+ int cplen = 0;
+
+ lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+ lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm);
+
+ stripe_count = lsm_is_released(lsm) ? 0 : lsm->lsm_stripe_count;
+
+ cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name,
+ sizeof(lsm->lsm_pool_name));
+ if (cplen >= sizeof(lsm->lsm_pool_name))
+ return -E2BIG;
+
+ for (i = 0; i < stripe_count; i++) {
+ /* XXX LOV STACKING call down to osc_unpackmd() */
+ loi = lsm->lsm_oinfo[i];
+ ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+ loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+ loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+ CERROR("OST index %d more than OST count %d\n",
+ loi->loi_ost_idx, lov->desc.ld_tgt_count);
+ lov_dump_lmm_v3(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ if (!lov->lov_tgts[loi->loi_ost_idx]) {
+ CERROR("OST index %d missing\n", loi->loi_ost_idx);
+ lov_dump_lmm_v3(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ /* calculate the minimum stripe max bytes */
+ lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+ &stripe_maxbytes);
+ }
+
+ lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+ if (lsm->lsm_stripe_count == 0)
+ lsm->lsm_maxbytes = stripe_maxbytes * lov->desc.ld_tgt_count;
+
+ return 0;
+}
+
+const struct lsm_operations lsm_v3_ops = {
+ .lsm_free = lsm_free_plain,
+ .lsm_destroy = lsm_destroy_plain,
+ .lsm_stripe_by_index = lsm_stripe_by_index_plain,
+ .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+ .lsm_lmm_verify = lsm_lmm_verify_v3,
+ .lsm_unpackmd = lsm_unpackmd_v3,
+};
+
+void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm)
+{
+ CDEBUG(level, "lsm %p, objid " DOSTID ", maxbytes %#llx, magic 0x%08X, stripe_size %u, stripe_count %u, refc: %d, layout_gen %u, pool [" LOV_POOLNAMEF "]\n",
+ lsm,
+ POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic,
+ lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+ atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen,
+ lsm->lsm_pool_name);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h
new file mode 100644
index 000000000..b644acc9b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_internal.h
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LOV_INTERNAL_H
+#define LOV_INTERNAL_H
+
+#include "../include/obd_class.h"
+#include "../include/lustre/lustre_user.h"
+
+/* lov_do_div64(a, b) returns a % b, and a = a / b.
+ * The 32-bit code is LOV-specific because it relies on the stripe-size
+ * limits to reduce the divisor to a 32-bit number. If the divisor is
+ * already a 32-bit value the compiler handles this directly. */
+#if BITS_PER_LONG == 64
+# define lov_do_div64(n, base) ({ \
+ uint64_t __base = (base); \
+ uint64_t __rem; \
+ __rem = ((uint64_t)(n)) % __base; \
+ (n) = ((uint64_t)(n)) / __base; \
+ __rem; \
+})
+#elif BITS_PER_LONG == 32
+# define lov_do_div64(n, base) ({ \
+ uint64_t __rem; \
+ if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \
+ int __remainder; \
+ LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \
+ "division %llu / %llu\n", (n), (uint64_t)(base)); \
+ __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \
+ (n) >>= LOV_MIN_STRIPE_BITS; \
+ __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \
+ __rem <<= LOV_MIN_STRIPE_BITS; \
+ __rem += __remainder; \
+ } else { \
+ __rem = do_div(n, base); \
+ } \
+ __rem; \
+})
+#endif
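+
+/*
+ * Example (illustrative names only): splitting a file offset by the stripe
+ * width,
+ *
+ *   u64 off = file_offset;
+ *   u64 rem = lov_do_div64(off, stripe_width);
+ *
+ * leaves off == file_offset / stripe_width and returns
+ * rem == file_offset % stripe_width.
+ */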
+
+struct lov_lock_handles {
+ struct portals_handle llh_handle;
+ atomic_t llh_refcount;
+ int llh_stripe_count;
+ struct lustre_handle llh_handles[0];
+};
+
+struct lov_request {
+ struct obd_info rq_oi;
+ struct lov_request_set *rq_rqset;
+
+ struct list_head rq_link;
+
+ int rq_idx; /* index in lov->tgts array */
+ int rq_stripe; /* stripe number */
+ int rq_complete;
+ int rq_rc;
+ int rq_buflen; /* length of sub_md */
+
+ u32 rq_oabufs;
+ u32 rq_pgaidx;
+};
+
+struct lov_request_set {
+ struct ldlm_enqueue_info *set_ei;
+ struct obd_info *set_oi;
+ atomic_t set_refcount;
+ struct obd_export *set_exp;
+ /* XXX: There is @set_exp already; however, obd_statfs gets only an
+ * obd_device. */
+ struct obd_device *set_obd;
+ int set_count;
+ atomic_t set_completes;
+ atomic_t set_success;
+ atomic_t set_finish_checked;
+ struct llog_cookie *set_cookies;
+ int set_cookie_sent;
+ struct obd_trans_info *set_oti;
+ u32 set_oabufs;
+ struct brw_page *set_pga;
+ struct lov_lock_handles *set_lockh;
+ struct list_head set_list;
+ wait_queue_head_t set_waitq;
+ spinlock_t set_lock;
+};
+
+extern struct kmem_cache *lov_oinfo_slab;
+
+extern struct lu_kmem_descr lov_caches[];
+
+void lov_finish_set(struct lov_request_set *set);
+
+static inline void lov_get_reqset(struct lov_request_set *set)
+{
+ LASSERT(set != NULL);
+ LASSERT(atomic_read(&set->set_refcount) > 0);
+ atomic_inc(&set->set_refcount);
+}
+
+static inline void lov_put_reqset(struct lov_request_set *set)
+{
+ if (atomic_dec_and_test(&set->set_refcount))
+ lov_finish_set(set);
+}
+
+static inline struct lov_lock_handles *
+lov_handle2llh(struct lustre_handle *handle)
+{
+ LASSERT(handle != NULL);
+ return class_handle2object(handle->cookie);
+}
+
+static inline void lov_llh_put(struct lov_lock_handles *llh)
+{
+ CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh,
+ atomic_read(&llh->llh_refcount) - 1);
+ LASSERT(atomic_read(&llh->llh_refcount) > 0 &&
+ atomic_read(&llh->llh_refcount) < 0x5a5a);
+ if (atomic_dec_and_test(&llh->llh_refcount)) {
+ class_handle_unhash(&llh->llh_handle);
+ /* The structure may still be held by other threads because of RCU.
+ * -jxiong */
+ if (atomic_read(&llh->llh_refcount))
+ return;
+
+ OBD_FREE_RCU(llh, sizeof(*llh) +
+ sizeof(*llh->llh_handles) * llh->llh_stripe_count,
+ &llh->llh_handle);
+ }
+}
+
+#define lov_uuid2str(lv, index) \
+ (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid)
+
+/* lov_merge.c */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid,
+ struct lov_stripe_md *lsm, int stripeno, int *set);
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+ u64 size, int shrink);
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+ struct ost_lvb *lvb, __u64 *kms_place);
+
+/* lov_offset.c */
+u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size,
+ int stripeno);
+int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off,
+ int stripeno, u64 *obd_off);
+u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size,
+ int stripeno);
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+ u64 start, u64 end,
+ u64 *obd_start, u64 *obd_end);
+int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off);
+
+/* lov_qos.c */
+#define LOV_USES_ASSIGNED_STRIPE 0
+#define LOV_USES_DEFAULT_STRIPE 1
+int qos_add_tgt(struct obd_device *obd, __u32 index);
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt);
+void qos_shrink_lsm(struct lov_request_set *set);
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
+void qos_update(struct lov_obd *lov);
+void qos_statfs_done(struct lov_obd *lov);
+void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait);
+int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
+
+/* lov_request.c */
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+int lov_set_finished(struct lov_request_set *set, int idempotent);
+void lov_update_set(struct lov_request_set *set,
+ struct lov_request *req, int rc);
+int lov_update_common_set(struct lov_request_set *set,
+ struct lov_request *req, int rc);
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx);
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_request_set **reqset);
+int lov_fini_getattr_set(struct lov_request_set *set);
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obdo *src_oa, struct lov_stripe_md *lsm,
+ struct obd_trans_info *oti,
+ struct lov_request_set **reqset);
+int lov_fini_destroy_set(struct lov_request_set *set);
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct lov_request_set **reqset);
+int lov_update_setattr_set(struct lov_request_set *set,
+ struct lov_request *req, int rc);
+int lov_fini_setattr_set(struct lov_request_set *set);
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+ struct lov_request_set **reqset);
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+ int success);
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+ int success);
+int lov_fini_statfs_set(struct lov_request_set *set);
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc);
+
+/* lov_obd.c */
+void lov_fix_desc(struct lov_desc *desc);
+void lov_fix_desc_stripe_size(__u64 *val);
+void lov_fix_desc_stripe_count(__u32 *val);
+void lov_fix_desc_pattern(__u32 *val);
+void lov_fix_desc_qos_maxage(__u32 *val);
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count);
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+ struct obd_connect_data *data);
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+ __u32 *indexp, int *genp);
+int lov_del_target(struct obd_device *obd, __u32 index,
+ struct obd_uuid *uuidp, int gen);
+
+/* lov_pack.c */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm,
+ struct lov_stripe_md *lsm);
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+ struct lov_mds_md *lmm, int lmm_bytes);
+int lov_getstripe(struct obd_export *exp,
+ struct lov_stripe_md *lsm, struct lov_user_md *lump);
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+ int pattern, int magic);
+int lov_free_memmd(struct lov_stripe_md **lsmp);
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm_common(int level, void *lmmp);
+void lov_dump_lmm(int level, void *lmm);
+
+/* lov_ea.c */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size);
+void lsm_free_plain(struct lov_stripe_md *lsm);
+void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm);
+
+/* lproc_lov.c */
+#if defined (CONFIG_PROC_FS)
+extern const struct file_operations lov_proc_target_fops;
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+/* lov_cl.c */
+extern struct lu_device_type lov_device_type;
+
+/* pools */
+extern cfs_hash_ops_t pool_hash_operations;
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+void lov_pool_putref(struct pool_desc *pool);
+
+static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm)
+{
+ LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+ atomic_inc(&lsm->lsm_refc);
+ return lsm;
+}
+
+static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi)
+{
+ if (unlikely(loi->loi_oi.oi.oi_id == 0 &&
+ loi->loi_oi.oi.oi_seq == 0 &&
+ loi->loi_ost_idx == 0 &&
+ loi->loi_ost_gen == 0))
+ return true;
+
+ return false;
+}
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c
new file mode 100644
index 000000000..cf96e0d01
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_io.c
@@ -0,0 +1,990 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOV layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+static inline void lov_sub_enter(struct lov_io_sub *sub)
+{
+ sub->sub_reenter++;
+}
+static inline void lov_sub_exit(struct lov_io_sub *sub)
+{
+ sub->sub_reenter--;
+}
+
+static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
+ struct lov_io_sub *sub)
+{
+ if (sub->sub_io != NULL) {
+ if (sub->sub_io_initialized) {
+ lov_sub_enter(sub);
+ cl_io_fini(sub->sub_env, sub->sub_io);
+ lov_sub_exit(sub);
+ sub->sub_io_initialized = 0;
+ lio->lis_active_subios--;
+ }
+ if (sub->sub_stripe == lio->lis_single_subio_index)
+ lio->lis_single_subio_index = -1;
+ else if (!sub->sub_borrowed)
+ OBD_FREE_PTR(sub->sub_io);
+ sub->sub_io = NULL;
+ }
+ if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) {
+ if (!sub->sub_borrowed)
+ cl_env_put(sub->sub_env, &sub->sub_refcheck);
+ sub->sub_env = NULL;
+ }
+}
+
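+/*
+ * Propagate the parameters of the parent IO into a per-stripe sub-io,
+ * converting file offsets into stripe-local values where the operation
+ * needs it (truncate size, fault index).
+ */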
+static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio,
+ int stripe, loff_t start, loff_t end)
+{
+ struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+ struct cl_io *parent = lio->lis_cl.cis_io;
+
+ switch (io->ci_type) {
+ case CIT_SETATTR: {
+ io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr;
+ io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid;
+ io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa;
+ if (cl_io_is_trunc(io)) {
+ loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size;
+
+ new_size = lov_size_to_stripe(lsm, new_size, stripe);
+ io->u.ci_setattr.sa_attr.lvb_size = new_size;
+ }
+ break;
+ }
+ case CIT_FAULT: {
+ struct cl_object *obj = parent->ci_obj;
+ loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index);
+
+ io->u.ci_fault = parent->u.ci_fault;
+ off = lov_size_to_stripe(lsm, off, stripe);
+ io->u.ci_fault.ft_index = cl_index(obj, off);
+ break;
+ }
+ case CIT_FSYNC: {
+ io->u.ci_fsync.fi_start = start;
+ io->u.ci_fsync.fi_end = end;
+ io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa;
+ io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid;
+ io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode;
+ break;
+ }
+ case CIT_READ:
+ case CIT_WRITE: {
+ io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent);
+ if (cl_io_is_append(parent)) {
+ io->u.ci_wr.wr_append = 1;
+ } else {
+ io->u.ci_rw.crw_pos = start;
+ io->u.ci_rw.crw_count = end - start;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
+ struct lov_io_sub *sub)
+{
+ struct lov_object *lov = lio->lis_object;
+ struct lov_device *ld = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev);
+ struct cl_io *sub_io;
+ struct cl_object *sub_obj;
+ struct cl_io *io = lio->lis_cl.cis_io;
+
+ int stripe = sub->sub_stripe;
+ int result;
+
+ LASSERT(sub->sub_io == NULL);
+ LASSERT(sub->sub_env == NULL);
+ LASSERT(sub->sub_stripe < lio->lis_stripe_count);
+
+ if (unlikely(lov_r0(lov)->lo_sub[stripe] == NULL))
+ return -EIO;
+
+ result = 0;
+ sub->sub_io_initialized = 0;
+ sub->sub_borrowed = 0;
+
+ if (lio->lis_mem_frozen) {
+ LASSERT(mutex_is_locked(&ld->ld_mutex));
+ sub->sub_io = &ld->ld_emrg[stripe]->emrg_subio;
+ sub->sub_env = ld->ld_emrg[stripe]->emrg_env;
+ sub->sub_borrowed = 1;
+ } else {
+ void *cookie;
+
+ /* obtain new environment */
+ cookie = cl_env_reenter();
+ sub->sub_env = cl_env_get(&sub->sub_refcheck);
+ cl_env_reexit(cookie);
+ if (IS_ERR(sub->sub_env))
+ result = PTR_ERR(sub->sub_env);
+
+ if (result == 0) {
+ /*
+ * First sub-io. Use ->lis_single_subio to
+ * avoid dynamic allocation.
+ */
+ if (lio->lis_active_subios == 0) {
+ sub->sub_io = &lio->lis_single_subio;
+ lio->lis_single_subio_index = stripe;
+ } else {
+ OBD_ALLOC_PTR(sub->sub_io);
+ if (sub->sub_io == NULL)
+ result = -ENOMEM;
+ }
+ }
+ }
+
+ if (result == 0) {
+ sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]);
+ sub_io = sub->sub_io;
+
+ sub_io->ci_obj = sub_obj;
+ sub_io->ci_result = 0;
+
+ sub_io->ci_parent = io;
+ sub_io->ci_lockreq = io->ci_lockreq;
+ sub_io->ci_type = io->ci_type;
+ sub_io->ci_no_srvlock = io->ci_no_srvlock;
+ sub_io->ci_noatime = io->ci_noatime;
+
+ lov_sub_enter(sub);
+ result = cl_io_sub_init(sub->sub_env, sub_io,
+ io->ci_type, sub_obj);
+ lov_sub_exit(sub);
+ if (result >= 0) {
+ lio->lis_active_subios++;
+ sub->sub_io_initialized = 1;
+ result = 0;
+ }
+ }
+ if (result != 0)
+ lov_io_sub_fini(env, lio, sub);
+ return result;
+}
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env,
+ struct lov_io *lio, int stripe)
+{
+ int rc;
+ struct lov_io_sub *sub = &lio->lis_subs[stripe];
+
+ LASSERT(stripe < lio->lis_stripe_count);
+
+ if (!sub->sub_io_initialized) {
+ sub->sub_stripe = stripe;
+ rc = lov_io_sub_init(env, lio, sub);
+ } else
+ rc = 0;
+ if (rc == 0)
+ lov_sub_enter(sub);
+ else
+ sub = ERR_PTR(rc);
+ return sub;
+}
+
+void lov_sub_put(struct lov_io_sub *sub)
+{
+ lov_sub_exit(sub);
+}
+
+/*****************************************************************************
+ *
+ * Lov io operations.
+ *
+ */
+
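+/* Return the index of the stripe that a given cl_page belongs to. */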
+static int lov_page_stripe(const struct cl_page *page)
+{
+ struct lovsub_object *subobj;
+
+ subobj = lu2lovsub(
+ lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header,
+ &lovsub_device_type));
+ LASSERT(subobj != NULL);
+ return subobj->lso_index;
+}
+
+struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio,
+ const struct cl_page_slice *slice)
+{
+ struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+ struct cl_page *page = slice->cpl_page;
+ int stripe;
+
+ LASSERT(lio->lis_cl.cis_io != NULL);
+ LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object);
+ LASSERT(lsm != NULL);
+ LASSERT(lio->lis_nr_subios > 0);
+
+ stripe = lov_page_stripe(page);
+ return lov_sub_get(env, lio, stripe);
+}
+
+
+static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
+ struct cl_io *io)
+{
+ struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+ int result;
+
+ LASSERT(lio->lis_object != NULL);
+
+ /*
+ * This needs to be optimized: we can't afford to allocate a piece of
+ * memory when writing a page. -jay
+ */
+ OBD_ALLOC_LARGE(lio->lis_subs,
+ lsm->lsm_stripe_count * sizeof(lio->lis_subs[0]));
+ if (lio->lis_subs != NULL) {
+ lio->lis_nr_subios = lio->lis_stripe_count;
+ lio->lis_single_subio_index = -1;
+ lio->lis_active_subios = 0;
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+static void lov_io_slice_init(struct lov_io *lio,
+ struct lov_object *obj, struct cl_io *io)
+{
+ io->ci_result = 0;
+ lio->lis_object = obj;
+
+ LASSERT(obj->lo_lsm != NULL);
+ lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count;
+
+ switch (io->ci_type) {
+ case CIT_READ:
+ case CIT_WRITE:
+ lio->lis_pos = io->u.ci_rw.crw_pos;
+ lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+ lio->lis_io_endpos = lio->lis_endpos;
+ if (cl_io_is_append(io)) {
+ LASSERT(io->ci_type == CIT_WRITE);
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ }
+ break;
+
+ case CIT_SETATTR:
+ if (cl_io_is_trunc(io))
+ lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size;
+ else
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ break;
+
+ case CIT_FAULT: {
+ pgoff_t index = io->u.ci_fault.ft_index;
+ lio->lis_pos = cl_offset(io->ci_obj, index);
+ lio->lis_endpos = cl_offset(io->ci_obj, index + 1);
+ break;
+ }
+
+ case CIT_FSYNC: {
+ lio->lis_pos = io->u.ci_fsync.fi_start;
+ lio->lis_endpos = io->u.ci_fsync.fi_end;
+ break;
+ }
+
+ case CIT_MISC:
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+ break;
+
+ default:
+ LBUG();
+ }
+}
+
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct lov_object *lov = cl2lov(ios->cis_obj);
+ int i;
+
+ if (lio->lis_subs != NULL) {
+ for (i = 0; i < lio->lis_nr_subios; i++)
+ lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+ OBD_FREE_LARGE(lio->lis_subs,
+ lio->lis_nr_subios * sizeof(lio->lis_subs[0]));
+ lio->lis_nr_subios = 0;
+ }
+
+ LASSERT(atomic_read(&lov->lo_active_ios) > 0);
+ if (atomic_dec_and_test(&lov->lo_active_ios))
+ wake_up_all(&lov->lo_waitq);
+}
+
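+/* Shift an offset by @delta, leaving the OBD_OBJECT_EOF sentinel untouched. */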
+static u64 lov_offset_mod(u64 val, int delta)
+{
+ if (val != OBD_OBJECT_EOF)
+ val += delta;
+ return val;
+}
+
+static int lov_io_iter_init(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+ struct lov_io_sub *sub;
+ u64 endpos;
+ u64 start;
+ u64 end;
+ int stripe;
+ int rc = 0;
+
+ endpos = lov_offset_mod(lio->lis_endpos, -1);
+ for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) {
+ if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos,
+ endpos, &start, &end))
+ continue;
+
+ if (unlikely(lov_r0(lio->lis_object)->lo_sub[stripe] == NULL)) {
+ if (ios->cis_io->ci_type == CIT_READ ||
+ ios->cis_io->ci_type == CIT_WRITE ||
+ ios->cis_io->ci_type == CIT_FAULT)
+ return -EIO;
+
+ continue;
+ }
+
+ end = lov_offset_mod(end, 1);
+ sub = lov_sub_get(env, lio, stripe);
+ if (!IS_ERR(sub)) {
+ lov_io_sub_inherit(sub->sub_io, lio, stripe,
+ start, end);
+ rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+ lov_sub_put(sub);
+ CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n",
+ stripe, start, end);
+ } else
+ rc = PTR_ERR(sub);
+
+ if (!rc)
+ list_add_tail(&sub->sub_linkage, &lio->lis_active);
+ else
+ break;
+ }
+ return rc;
+}
+
+static int lov_io_rw_iter_init(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+ __u64 start = io->u.ci_rw.crw_pos;
+ loff_t next;
+ unsigned long ssize = lsm->lsm_stripe_size;
+
+ LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+ /* fast path for common case. */
+ if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) {
+
+ lov_do_div64(start, ssize);
+ next = (start + 1) * ssize;
+ if (next <= start * ssize)
+ next = ~0ull;
+
+ io->ci_continue = next < lio->lis_io_endpos;
+ io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos,
+ next) - io->u.ci_rw.crw_pos;
+ lio->lis_pos = io->u.ci_rw.crw_pos;
+ lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+ CDEBUG(D_VFSTRACE, "stripe: %llu chunk: [%llu, %llu) %llu\n",
+ (__u64)start, lio->lis_pos, lio->lis_endpos,
+ (__u64)lio->lis_io_endpos);
+ }
+ /*
+ * XXX The following call should be optimized: we know that
+ * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe.
+ */
+ return lov_io_iter_init(env, ios);
+}
+
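+/*
+ * Apply @iofunc to every active sub-io, stopping at the first failure and
+ * recording the first non-zero sub-io result in the parent IO.
+ */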
+static int lov_io_call(const struct lu_env *env, struct lov_io *lio,
+ int (*iofunc)(const struct lu_env *, struct cl_io *))
+{
+ struct cl_io *parent = lio->lis_cl.cis_io;
+ struct lov_io_sub *sub;
+ int rc = 0;
+
+ list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+ lov_sub_enter(sub);
+ rc = iofunc(sub->sub_env, sub->sub_io);
+ lov_sub_exit(sub);
+ if (rc)
+ break;
+
+ if (parent->ci_result == 0)
+ parent->ci_result = sub->sub_io->ci_result;
+ }
+ return rc;
+}
+
+static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ return lov_io_call(env, cl2lov_io(env, ios), cl_io_lock);
+}
+
+static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ return lov_io_call(env, cl2lov_io(env, ios), cl_io_start);
+}
+
+static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+ /*
+ * It's possible that lov_io_start() wasn't called against this
+ * sub-io, either because previous sub-io failed, or upper layer
+ * completed IO.
+ */
+ if (io->ci_state == CIS_IO_GOING)
+ cl_io_end(env, io);
+ else
+ io->ci_state = CIS_IO_FINISHED;
+ return 0;
+}
+
+static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+ cl_io_iter_fini(env, io);
+ return 0;
+}
+
+static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+ cl_io_unlock(env, io);
+ return 0;
+}
+
+static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+ int rc;
+
+ rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper);
+ LASSERT(rc == 0);
+}
+
+static void lov_io_iter_fini(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ int rc;
+
+ rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper);
+ LASSERT(rc == 0);
+ while (!list_empty(&lio->lis_active))
+ list_del_init(lio->lis_active.next);
+}
+
+static void lov_io_unlock(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ int rc;
+
+ rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper);
+ LASSERT(rc == 0);
+}
+
+
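+/*
+ * Pick the per-stripe page queue: the freshly allocated array when memory
+ * allocation is possible, otherwise the pre-allocated emergency list for
+ * this stripe.
+ */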
+static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld,
+ struct cl_page_list *qin,
+ int idx, int alloc)
+{
+ return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list;
+}
+
+/**
+ * LOV implementation of the cl_operations::cio_submit() method. It takes a
+ * list of pages in \a queue, splits it into per-stripe sub-lists, invokes
+ * cl_io_submit() on the underlying devices to submit the sub-lists, and then
+ * splices everything back.
+ *
+ * The major complication of this function is the need to handle memory
+ * cleansing: cl_io_submit() is called to write out pages as part of VM
+ * memory reclamation, and hence it may not fail due to memory shortage (the
+ * system would deadlock otherwise). To deal with this, some resources
+ * (sub-lists, sub-environments, etc.) are allocated per device on "startup"
+ * (i.e., in a non-memory-cleansing context), and in case of memory shortage
+ * these pre-allocated resources are used by lov_io_submit() under the
+ * lov_device::ld_mutex mutex.
+ */
+static int lov_io_submit(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ enum cl_req_type crt, struct cl_2queue *queue)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct lov_object *obj = lio->lis_object;
+ struct lov_device *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev);
+ struct cl_page_list *qin = &queue->c2_qin;
+ struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q;
+ struct cl_page_list *stripes_qin = NULL;
+ struct cl_page *page;
+ struct cl_page *tmp;
+ int stripe;
+
+#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc)
+
+ int rc = 0;
+ int alloc =
+ !(current->flags & PF_MEMALLOC);
+
+ if (lio->lis_active_subios == 1) {
+ int idx = lio->lis_single_subio_index;
+ struct lov_io_sub *sub;
+
+ LASSERT(idx < lio->lis_nr_subios);
+ sub = lov_sub_get(env, lio, idx);
+ LASSERT(!IS_ERR(sub));
+ LASSERT(sub->sub_io == &lio->lis_single_subio);
+ rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+ crt, queue);
+ lov_sub_put(sub);
+ return rc;
+ }
+
+ LASSERT(lio->lis_subs != NULL);
+ if (alloc) {
+ OBD_ALLOC_LARGE(stripes_qin,
+ sizeof(*stripes_qin) * lio->lis_nr_subios);
+ if (stripes_qin == NULL)
+ return -ENOMEM;
+
+ for (stripe = 0; stripe < lio->lis_nr_subios; stripe++)
+ cl_page_list_init(&stripes_qin[stripe]);
+ } else {
+ /*
+ * If we get here, it means pageout & swap doesn't help.
+ * In order to not make things worse, even don't try to
+ * allocate the memory with __GFP_NOWARN. -jay
+ */
+ mutex_lock(&ld->ld_mutex);
+ lio->lis_mem_frozen = 1;
+ }
+
+ cl_2queue_init(cl2q);
+ cl_page_list_for_each_safe(page, tmp, qin) {
+ stripe = lov_page_stripe(page);
+ cl_page_list_move(QIN(stripe), qin, page);
+ }
+
+ for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+ struct lov_io_sub *sub;
+ struct cl_page_list *sub_qin = QIN(stripe);
+
+ if (list_empty(&sub_qin->pl_pages))
+ continue;
+
+ cl_page_list_splice(sub_qin, &cl2q->c2_qin);
+ sub = lov_sub_get(env, lio, stripe);
+ if (!IS_ERR(sub)) {
+ rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+ crt, cl2q);
+ lov_sub_put(sub);
+ } else
+ rc = PTR_ERR(sub);
+ cl_page_list_splice(&cl2q->c2_qin, &queue->c2_qin);
+ cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout);
+ if (rc != 0)
+ break;
+ }
+
+ for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+ struct cl_page_list *sub_qin = QIN(stripe);
+
+ if (list_empty(&sub_qin->pl_pages))
+ continue;
+
+ cl_page_list_splice(sub_qin, qin);
+ }
+
+ if (alloc) {
+ OBD_FREE_LARGE(stripes_qin,
+ sizeof(*stripes_qin) * lio->lis_nr_subios);
+ } else {
+ int i;
+
+ for (i = 0; i < lio->lis_nr_subios; i++) {
+ struct cl_io *cio = lio->lis_subs[i].sub_io;
+
+ if (cio && cio == &ld->ld_emrg[i]->emrg_subio)
+ lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+ }
+ lio->lis_mem_frozen = 0;
+ mutex_unlock(&ld->ld_mutex);
+ }
+
+ return rc;
+#undef QIN
+}
+
+static int lov_io_prepare_write(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice,
+ unsigned from, unsigned to)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct cl_page *sub_page = lov_sub_page(slice);
+ struct lov_io_sub *sub;
+ int result;
+
+ sub = lov_page_subio(env, lio, slice);
+ if (!IS_ERR(sub)) {
+ result = cl_io_prepare_write(sub->sub_env, sub->sub_io,
+ sub_page, from, to);
+ lov_sub_put(sub);
+ } else
+ result = PTR_ERR(sub);
+ return result;
+}
+
+static int lov_io_commit_write(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice,
+ unsigned from, unsigned to)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct cl_page *sub_page = lov_sub_page(slice);
+ struct lov_io_sub *sub;
+ int result;
+
+ sub = lov_page_subio(env, lio, slice);
+ if (!IS_ERR(sub)) {
+ result = cl_io_commit_write(sub->sub_env, sub->sub_io,
+ sub_page, from, to);
+ lov_sub_put(sub);
+ } else
+ result = PTR_ERR(sub);
+ return result;
+}
+
+static int lov_io_fault_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_fault_io *fio;
+ struct lov_io *lio;
+ struct lov_io_sub *sub;
+
+ fio = &ios->cis_io->u.ci_fault;
+ lio = cl2lov_io(env, ios);
+ sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page));
+ sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob;
+ lov_sub_put(sub);
+ return lov_io_start(env, ios);
+}
+
+static void lov_io_fsync_end(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_io *lio = cl2lov_io(env, ios);
+ struct lov_io_sub *sub;
+ unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written;
+
+ *written = 0;
+ list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+ struct cl_io *subio = sub->sub_io;
+
+ lov_sub_enter(sub);
+ lov_io_end_wrapper(sub->sub_env, subio);
+ lov_sub_exit(sub);
+
+ if (subio->ci_result == 0)
+ *written += subio->u.ci_fsync.fi_nr_written;
+ }
+}
+
+static const struct cl_io_operations lov_io_ops = {
+ .op = {
+ [CIT_READ] = {
+ .cio_fini = lov_io_fini,
+ .cio_iter_init = lov_io_rw_iter_init,
+ .cio_iter_fini = lov_io_iter_fini,
+ .cio_lock = lov_io_lock,
+ .cio_unlock = lov_io_unlock,
+ .cio_start = lov_io_start,
+ .cio_end = lov_io_end
+ },
+ [CIT_WRITE] = {
+ .cio_fini = lov_io_fini,
+ .cio_iter_init = lov_io_rw_iter_init,
+ .cio_iter_fini = lov_io_iter_fini,
+ .cio_lock = lov_io_lock,
+ .cio_unlock = lov_io_unlock,
+ .cio_start = lov_io_start,
+ .cio_end = lov_io_end
+ },
+ [CIT_SETATTR] = {
+ .cio_fini = lov_io_fini,
+ .cio_iter_init = lov_io_iter_init,
+ .cio_iter_fini = lov_io_iter_fini,
+ .cio_lock = lov_io_lock,
+ .cio_unlock = lov_io_unlock,
+ .cio_start = lov_io_start,
+ .cio_end = lov_io_end
+ },
+ [CIT_FAULT] = {
+ .cio_fini = lov_io_fini,
+ .cio_iter_init = lov_io_iter_init,
+ .cio_iter_fini = lov_io_iter_fini,
+ .cio_lock = lov_io_lock,
+ .cio_unlock = lov_io_unlock,
+ .cio_start = lov_io_fault_start,
+ .cio_end = lov_io_end
+ },
+ [CIT_FSYNC] = {
+ .cio_fini = lov_io_fini,
+ .cio_iter_init = lov_io_iter_init,
+ .cio_iter_fini = lov_io_iter_fini,
+ .cio_lock = lov_io_lock,
+ .cio_unlock = lov_io_unlock,
+ .cio_start = lov_io_start,
+ .cio_end = lov_io_fsync_end
+ },
+ [CIT_MISC] = {
+ .cio_fini = lov_io_fini
+ }
+ },
+ .req_op = {
+ [CRT_READ] = {
+ .cio_submit = lov_io_submit
+ },
+ [CRT_WRITE] = {
+ .cio_submit = lov_io_submit
+ }
+ },
+ .cio_prepare_write = lov_io_prepare_write,
+ .cio_commit_write = lov_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Empty lov io operations.
+ *
+ */
+
+static void lov_empty_io_fini(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct lov_object *lov = cl2lov(ios->cis_obj);
+
+ if (atomic_dec_and_test(&lov->lo_active_ios))
+ wake_up_all(&lov->lo_waitq);
+}
+
+static void lov_empty_impossible(const struct lu_env *env,
+ struct cl_io_slice *ios)
+{
+ LBUG();
+}
+
+#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible)
+
+/**
+ * An io operation vector for files without stripes.
+ */
+static const struct cl_io_operations lov_empty_io_ops = {
+ .op = {
+ [CIT_READ] = {
+ .cio_fini = lov_empty_io_fini,
+#if 0
+ .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+ .cio_lock = LOV_EMPTY_IMPOSSIBLE,
+ .cio_start = LOV_EMPTY_IMPOSSIBLE,
+ .cio_end = LOV_EMPTY_IMPOSSIBLE
+#endif
+ },
+ [CIT_WRITE] = {
+ .cio_fini = lov_empty_io_fini,
+ .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+ .cio_lock = LOV_EMPTY_IMPOSSIBLE,
+ .cio_start = LOV_EMPTY_IMPOSSIBLE,
+ .cio_end = LOV_EMPTY_IMPOSSIBLE
+ },
+ [CIT_SETATTR] = {
+ .cio_fini = lov_empty_io_fini,
+ .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+ .cio_lock = LOV_EMPTY_IMPOSSIBLE,
+ .cio_start = LOV_EMPTY_IMPOSSIBLE,
+ .cio_end = LOV_EMPTY_IMPOSSIBLE
+ },
+ [CIT_FAULT] = {
+ .cio_fini = lov_empty_io_fini,
+ .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+ .cio_lock = LOV_EMPTY_IMPOSSIBLE,
+ .cio_start = LOV_EMPTY_IMPOSSIBLE,
+ .cio_end = LOV_EMPTY_IMPOSSIBLE
+ },
+ [CIT_FSYNC] = {
+ .cio_fini = lov_empty_io_fini
+ },
+ [CIT_MISC] = {
+ .cio_fini = lov_empty_io_fini
+ }
+ },
+ .req_op = {
+ [CRT_READ] = {
+ .cio_submit = LOV_EMPTY_IMPOSSIBLE
+ },
+ [CRT_WRITE] = {
+ .cio_submit = LOV_EMPTY_IMPOSSIBLE
+ }
+ },
+ .cio_commit_write = LOV_EMPTY_IMPOSSIBLE
+};
+
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io)
+{
+ struct lov_io *lio = lov_env_io(env);
+ struct lov_object *lov = cl2lov(obj);
+
+ INIT_LIST_HEAD(&lio->lis_active);
+ lov_io_slice_init(lio, lov, io);
+ if (io->ci_result == 0) {
+ io->ci_result = lov_io_subio_init(env, lio, io);
+ if (io->ci_result == 0) {
+ cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+ atomic_inc(&lov->lo_active_ios);
+ }
+ }
+ return io->ci_result;
+}
+
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io)
+{
+ struct lov_object *lov = cl2lov(obj);
+ struct lov_io *lio = lov_env_io(env);
+ int result;
+
+ lio->lis_object = lov;
+ switch (io->ci_type) {
+ default:
+ LBUG();
+ case CIT_MISC:
+ case CIT_READ:
+ result = 0;
+ break;
+ case CIT_FSYNC:
+ case CIT_SETATTR:
+ result = 1;
+ break;
+ case CIT_WRITE:
+ result = -EBADF;
+ break;
+ case CIT_FAULT:
+ result = -EFAULT;
+ CERROR("Page fault on a file without stripes: "DFID"\n",
+ PFID(lu_object_fid(&obj->co_lu)));
+ break;
+ }
+ if (result == 0) {
+ cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+ atomic_inc(&lov->lo_active_ios);
+ }
+
+ io->ci_result = result < 0 ? result : 0;
+ return result != 0;
+}
+
+int lov_io_init_released(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io)
+{
+ struct lov_object *lov = cl2lov(obj);
+ struct lov_io *lio = lov_env_io(env);
+ int result;
+
+ LASSERT(lov->lo_lsm != NULL);
+ lio->lis_object = lov;
+
+ switch (io->ci_type) {
+ default:
+ LASSERTF(0, "invalid type %d\n", io->ci_type);
+ case CIT_MISC:
+ case CIT_FSYNC:
+ result = 1;
+ break;
+ case CIT_SETATTR:
+ /* a truncate to 0 is handled by the MDT:
+ * - in open, for O_TRUNC
+ * - in setattr, for truncate
+ */
+ /* a truncate to a size > 0 therefore triggers a restore */
+ if (cl_io_is_trunc(io))
+ io->ci_restore_needed = 1;
+ result = -ENODATA;
+ break;
+ case CIT_READ:
+ case CIT_WRITE:
+ case CIT_FAULT:
+ io->ci_restore_needed = 1;
+ result = -ENODATA;
+ break;
+ }
+ if (result == 0) {
+ cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+ atomic_inc(&lov->lo_active_ios);
+ }
+
+ io->ci_result = result < 0 ? result : 0;
+ return result != 0;
+}
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c
new file mode 100644
index 000000000..f2eca565b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_lock.c
@@ -0,0 +1,1198 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+ struct cl_lock *parent);
+
+static int lov_lock_unuse(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env,
+ struct cl_lock *parent,
+ struct lov_lock_sub *lls)
+{
+ struct lov_sublock_env *subenv;
+ struct lov_io *lio = lov_env_io(env);
+ struct cl_io *io = lio->lis_cl.cis_io;
+ struct lov_io_sub *sub;
+
+ subenv = &lov_env_session(env)->ls_subenv;
+
+ /*
+ * FIXME: We tend to use the sub-io's env & io to call the sublock
+ * operations because the osc lock sometimes stores control variables
+ * in the thread's IO information (currently only lockless information).
+ * However, if the lock's host (object) is different from the object of
+ * the current IO, we have no way to get the subenv and sub-io because
+ * they are not initialized at all. As a temporary fix, in this case we
+ * still borrow the parent's env to call the sublock operations.
+ if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) {
+ subenv->lse_env = env;
+ subenv->lse_io = io;
+ subenv->lse_sub = NULL;
+ } else {
+ sub = lov_sub_get(env, lio, lls->sub_stripe);
+ if (!IS_ERR(sub)) {
+ subenv->lse_env = sub->sub_env;
+ subenv->lse_io = sub->sub_io;
+ subenv->lse_sub = sub;
+ } else {
+ subenv = (void *)sub;
+ }
+ }
+ return subenv;
+}
+
+static void lov_sublock_env_put(struct lov_sublock_env *subenv)
+{
+ if (subenv && subenv->lse_sub)
+ lov_sub_put(subenv->lse_sub);
+}
+
+static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck,
+ struct cl_lock *sublock, int idx,
+ struct lov_lock_link *link)
+{
+ struct lovsub_lock *lsl;
+ struct cl_lock *parent = lck->lls_cl.cls_lock;
+ int rc;
+
+ LASSERT(cl_lock_is_mutexed(parent));
+ LASSERT(cl_lock_is_mutexed(sublock));
+
+ lsl = cl2sub_lock(sublock);
+ /*
+ * check that the sub-lock doesn't already have a lock link to this top-lock.
+ */
+ LASSERT(lov_lock_link_find(env, lck, lsl) == NULL);
+ LASSERT(idx < lck->lls_nr);
+
+ lck->lls_sub[idx].sub_lock = lsl;
+ lck->lls_nr_filled++;
+ LASSERT(lck->lls_nr_filled <= lck->lls_nr);
+ list_add_tail(&link->lll_list, &lsl->lss_parents);
+ link->lll_idx = idx;
+ link->lll_super = lck;
+ cl_lock_get(parent);
+ lu_ref_add(&parent->cll_reference, "lov-child", sublock);
+ lck->lls_sub[idx].sub_flags |= LSF_HELD;
+ cl_lock_user_add(env, sublock);
+
+ rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx);
+ LASSERT(rc == 0); /* there is no way this can fail, currently */
+}
+
+static struct cl_lock *lov_sublock_alloc(const struct lu_env *env,
+ const struct cl_io *io,
+ struct lov_lock *lck,
+ int idx, struct lov_lock_link **out)
+{
+ struct cl_lock *sublock;
+ struct cl_lock *parent;
+ struct lov_lock_link *link;
+
+ LASSERT(idx < lck->lls_nr);
+
+ OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, GFP_NOFS);
+ if (link != NULL) {
+ struct lov_sublock_env *subenv;
+ struct lov_lock_sub *lls;
+ struct cl_lock_descr *descr;
+
+ parent = lck->lls_cl.cls_lock;
+ lls = &lck->lls_sub[idx];
+ descr = &lls->sub_got;
+
+ subenv = lov_sublock_env_get(env, parent, lls);
+ if (!IS_ERR(subenv)) {
+ /* CAVEAT: Don't try to add a field in lov_lock_sub
+ * to remember the subio. A lock can be cached, but
+ * this is not true for an IO, which further means a
+ * sublock might be referenced in different IO
+ * contexts. -jay */
+
+ sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io,
+ descr, "lov-parent", parent);
+ lov_sublock_env_put(subenv);
+ } else {
+ /* an error occurred */
+ sublock = (void *)subenv;
+ }
+
+ if (!IS_ERR(sublock))
+ *out = link;
+ else
+ OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+ } else
+ sublock = ERR_PTR(-ENOMEM);
+ return sublock;
+}
+
+static void lov_sublock_unlock(const struct lu_env *env,
+ struct lovsub_lock *lsl,
+ struct cl_lock_closure *closure,
+ struct lov_sublock_env *subenv)
+{
+ lov_sublock_env_put(subenv);
+ lsl->lss_active = NULL;
+ cl_lock_disclosure(env, closure);
+}
+
+static int lov_sublock_lock(const struct lu_env *env,
+ struct lov_lock *lck,
+ struct lov_lock_sub *lls,
+ struct cl_lock_closure *closure,
+ struct lov_sublock_env **lsep)
+{
+ struct lovsub_lock *sublock;
+ struct cl_lock *child;
+ int result = 0;
+
+ LASSERT(list_empty(&closure->clc_list));
+
+ sublock = lls->sub_lock;
+ child = sublock->lss_cl.cls_lock;
+ result = cl_lock_closure_build(env, child, closure);
+ if (result == 0) {
+ struct cl_lock *parent = closure->clc_origin;
+
+ LASSERT(cl_lock_is_mutexed(child));
+ sublock->lss_active = parent;
+
+ if (unlikely((child->cll_state == CLS_FREEING) ||
+ (child->cll_flags & CLF_CANCELLED))) {
+ struct lov_lock_link *link;
+ /*
+ * we could race with lock deletion, which temporarily
+ * puts the lock into the freeing state; see bug 19080.
+ */
+ LASSERT(!(lls->sub_flags & LSF_HELD));
+
+ link = lov_lock_link_find(env, lck, sublock);
+ LASSERT(link != NULL);
+ lov_lock_unlink(env, link, sublock);
+ lov_sublock_unlock(env, sublock, closure, NULL);
+ lck->lls_cancel_race = 1;
+ result = CLO_REPEAT;
+ } else if (lsep) {
+ struct lov_sublock_env *subenv;
+ subenv = lov_sublock_env_get(env, parent, lls);
+ if (IS_ERR(subenv)) {
+ lov_sublock_unlock(env, sublock,
+ closure, NULL);
+ result = PTR_ERR(subenv);
+ } else {
+ *lsep = subenv;
+ }
+ }
+ }
+ return result;
+}
+
+/**
+ * Updates the result of a top-lock operation from a result of sub-lock
+ * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate
+ * over sub-locks and lov_subresult() is used to calculate return value of a
+ * top-operation. To this end, possible return values of sub-operations are
+ * ordered as
+ *
+ * - 0 success
+ * - CLO_WAIT wait for event
+ * - CLO_REPEAT repeat top-operation
+ * - -ve fundamental error
+ *
+ * Top-level return code can only go down through this list. CLO_REPEAT
+ * overwrites CLO_WAIT, because lock mutex was released and sleeping condition
+ * has to be rechecked by the upper layer.
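+ *
+ * For example, lov_subresult(0, CLO_WAIT) returns CLO_WAIT,
+ * lov_subresult(CLO_WAIT, CLO_REPEAT) returns CLO_REPEAT, and
+ * lov_subresult(CLO_REPEAT, -EIO) returns -EIO, since errors rank
+ * above both waiting and repeating.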
+ */
+static int lov_subresult(int result, int rc)
+{
+ int result_rank;
+ int rc_rank;
+
+ LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT,
+ "result = %d", result);
+ LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT,
+ "rc = %d\n", rc);
+ CLASSERT(CLO_WAIT < CLO_REPEAT);
+
+ /* calculate ranks in the ordering above */
+ result_rank = result < 0 ? 1 + CLO_REPEAT : result;
+ rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc;
+
+ if (result_rank < rc_rank)
+ result = rc;
+ return result;
+}
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
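+ *
+ * The function makes two passes over the RAID0 stripes: the first pass
+ * counts the stripes that intersect the top-lock extent and allocates
+ * lck->lls_sub[], the second fills in the per-stripe cl_lock_descr
+ * entries. The sub-locks themselves are created later, when
+ * lov_lock_enqueue() calls lov_sublock_fill().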
+ */
+static int lov_lock_sub_init(const struct lu_env *env,
+ struct lov_lock *lck, const struct cl_io *io)
+{
+ int result = 0;
+ int i;
+ int nr;
+ u64 start;
+ u64 end;
+ u64 file_start;
+ u64 file_end;
+
+ struct lov_object *loo = cl2lov(lck->lls_cl.cls_obj);
+ struct lov_layout_raid0 *r0 = lov_r0(loo);
+ struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+ lck->lls_orig = parent->cll_descr;
+ file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start);
+ file_end = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1;
+
+ for (i = 0, nr = 0; i < r0->lo_nr; i++) {
+ /*
+ * XXX for wide striping a smarter algorithm is desirable,
+ * breaking out of the loop early.
+ */
+ if (likely(r0->lo_sub[i] != NULL) &&
+ lov_stripe_intersects(loo->lo_lsm, i,
+ file_start, file_end, &start, &end))
+ nr++;
+ }
+ LASSERT(nr > 0);
+ OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof(lck->lls_sub[0]));
+ if (lck->lls_sub == NULL)
+ return -ENOMEM;
+
+ lck->lls_nr = nr;
+ /*
+ * First, fill in sub-lock descriptions in
+ * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc()
+ * (called below in this function, and by lov_lock_enqueue()) to
+ * create sub-locks. At this moment, no other thread can access
+ * top-lock.
+ */
+ for (i = 0, nr = 0; i < r0->lo_nr; ++i) {
+ if (likely(r0->lo_sub[i] != NULL) &&
+ lov_stripe_intersects(loo->lo_lsm, i,
+ file_start, file_end, &start, &end)) {
+ struct cl_lock_descr *descr;
+
+ descr = &lck->lls_sub[nr].sub_descr;
+
+ LASSERT(descr->cld_obj == NULL);
+ descr->cld_obj = lovsub2cl(r0->lo_sub[i]);
+ descr->cld_start = cl_index(descr->cld_obj, start);
+ descr->cld_end = cl_index(descr->cld_obj, end);
+ descr->cld_mode = parent->cll_descr.cld_mode;
+ descr->cld_gid = parent->cll_descr.cld_gid;
+ descr->cld_enq_flags = parent->cll_descr.cld_enq_flags;
+ /* XXX has no effect */
+ lck->lls_sub[nr].sub_got = *descr;
+ lck->lls_sub[nr].sub_stripe = i;
+ nr++;
+ }
+ }
+ LASSERT(nr == lck->lls_nr);
+
+ /*
+ * Some sub-locks can be missing at this point. This is not a problem,
+ * because enqueue will create them anyway. The main duty of this
+ * function is to fill in sub-lock descriptions in a race-free manner.
+ */
+ return result;
+}
+
+static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
+ int i, int deluser, int rc)
+{
+ struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+ LASSERT(cl_lock_is_mutexed(parent));
+
+ if (lck->lls_sub[i].sub_flags & LSF_HELD) {
+ struct cl_lock *sublock;
+ int dying;
+
+ LASSERT(lck->lls_sub[i].sub_lock != NULL);
+ sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+ LASSERT(cl_lock_is_mutexed(sublock));
+
+ lck->lls_sub[i].sub_flags &= ~LSF_HELD;
+ if (deluser)
+ cl_lock_user_del(env, sublock);
+ /*
+ * If the last hold is released, and cancellation is pending
+ * for a sub-lock, release parent mutex, to avoid keeping it
+ * while sub-lock is being paged out.
+ */
+ dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+ sublock->cll_descr.cld_mode == CLM_GROUP ||
+ (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
+ sublock->cll_holds == 1;
+ if (dying)
+ cl_lock_mutex_put(env, parent);
+ cl_lock_unhold(env, sublock, "lov-parent", parent);
+ if (dying) {
+ cl_lock_mutex_get(env, parent);
+ rc = lov_subresult(rc, CLO_REPEAT);
+ }
+ /*
+ * From now on lck->lls_sub[i].sub_lock is a "weak" pointer, not
+ * backed by a reference on the sub-lock. lovsub_lock_delete()
+ * will clear lck->lls_sub[i].sub_lock under semaphores, just
+ * before the sub-lock is destroyed.
+ */
+ }
+ return rc;
+}
+
+static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck,
+ int i)
+{
+ struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+ LASSERT(cl_lock_is_mutexed(parent));
+
+ if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) {
+ struct cl_lock *sublock;
+
+ LASSERT(lck->lls_sub[i].sub_lock != NULL);
+ sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+ LASSERT(cl_lock_is_mutexed(sublock));
+ LASSERT(sublock->cll_state != CLS_FREEING);
+
+ lck->lls_sub[i].sub_flags |= LSF_HELD;
+
+ cl_lock_get_trust(sublock);
+ cl_lock_hold_add(env, sublock, "lov-parent", parent);
+ cl_lock_user_add(env, sublock);
+ cl_lock_put(env, sublock);
+ }
+}
+
+static void lov_lock_fini(const struct lu_env *env,
+ struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck;
+ int i;
+
+ lck = cl2lov_lock(slice);
+ LASSERT(lck->lls_nr_filled == 0);
+ if (lck->lls_sub != NULL) {
+ for (i = 0; i < lck->lls_nr; ++i)
+ /*
+ * No sub-locks exist at this point, as each sub-lock holds
+ * a reference on its parent.
+ */
+ LASSERT(lck->lls_sub[i].sub_lock == NULL);
+ OBD_FREE_LARGE(lck->lls_sub,
+ lck->lls_nr * sizeof(lck->lls_sub[0]));
+ }
+ OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+}
+
+static int lov_lock_enqueue_wait(const struct lu_env *env,
+ struct lov_lock *lck,
+ struct cl_lock *sublock)
+{
+ struct cl_lock *lock = lck->lls_cl.cls_lock;
+ int result;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+
+ cl_lock_mutex_put(env, lock);
+ result = cl_lock_enqueue_wait(env, sublock, 0);
+ cl_lock_mutex_get(env, lock);
+ return result ?: CLO_REPEAT;
+}
+
+/**
+ * Tries to advance a state machine of a given sub-lock toward enqueuing of
+ * the top-lock.
+ *
+ * \retval 0 if state-transition can proceed
+ * \retval -ve otherwise.
+ */
+static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck,
+ struct cl_lock *sublock,
+ struct cl_io *io, __u32 enqflags, int last)
+{
+ int result;
+
+ /* first, try to enqueue a sub-lock ... */
+ result = cl_enqueue_try(env, sublock, io, enqflags);
+ if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) {
+ /* if it is enqueued, try to `wait' on it---maybe it's already
+ * granted */
+ result = cl_wait_try(env, sublock);
+ if (result == CLO_REENQUEUED)
+ result = CLO_WAIT;
+ }
+ /*
+ * If the CEF_ASYNC flag is set, then all sub-locks can be enqueued in
+ * parallel; otherwise enqueue has to wait until the sub-lock is granted
+ * before proceeding to the next one.
+ */
+ if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) &&
+ (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL)))
+ result = 0;
+ return result;
+}
+
+/**
+ * Helper function for lov_lock_enqueue() that creates missing sub-lock.
+ */
+static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
+ struct cl_io *io, struct lov_lock *lck, int idx)
+{
+ struct lov_lock_link *link = NULL;
+ struct cl_lock *sublock;
+ int result;
+
+ LASSERT(parent->cll_depth == 1);
+ cl_lock_mutex_put(env, parent);
+ sublock = lov_sublock_alloc(env, io, lck, idx, &link);
+ if (!IS_ERR(sublock))
+ cl_lock_mutex_get(env, sublock);
+ cl_lock_mutex_get(env, parent);
+
+ if (!IS_ERR(sublock)) {
+ cl_lock_get_trust(sublock);
+ if (parent->cll_state == CLS_QUEUING &&
+ lck->lls_sub[idx].sub_lock == NULL) {
+ lov_sublock_adopt(env, lck, sublock, idx, link);
+ } else {
+ OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+ /* another thread allocated the sub-lock, or enqueue is
+ * no longer going on */
+ cl_lock_mutex_put(env, parent);
+ cl_lock_unhold(env, sublock, "lov-parent", parent);
+ cl_lock_mutex_get(env, parent);
+ }
+ cl_lock_mutex_put(env, sublock);
+ cl_lock_put(env, sublock);
+ result = CLO_REPEAT;
+ } else
+ result = PTR_ERR(sublock);
+ return result;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This
+ * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock
+ * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock
+ * state machines in the face of sub-locks sharing (by multiple top-locks),
+ * and concurrent sub-lock cancellations.
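+ *
+ * For each sub-lock the sub-lock mutex is taken through a closure and
+ * cl_enqueue_try() is called via lov_lock_enqueue_one(). A CLO_WAIT
+ * result is handled according to the sub-lock state: a CLS_QUEUING
+ * sub-lock makes the top-lock wait for the conflicting enqueue in
+ * lov_lock_enqueue_wait(), while a sub-lock in CLS_CACHED state is
+ * released with lov_sublock_release(). The loop stops as soon as
+ * lov_subresult() reports a non-zero combined status, and the final
+ * result is 0 only when all sub-locks have reached at least
+ * CLS_ENQUEUED (otherwise CLO_WAIT or the accumulated error is
+ * returned).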
+ */
+static int lov_lock_enqueue(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *io, __u32 enqflags)
+{
+ struct cl_lock *lock = slice->cls_lock;
+ struct lov_lock *lck = cl2lov_lock(slice);
+ struct cl_lock_closure *closure = lov_closure_get(env, lock);
+ int i;
+ int result;
+ enum cl_lock_state minstate;
+
+ for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+ int rc;
+ struct lovsub_lock *sub;
+ struct lov_lock_sub *lls;
+ struct cl_lock *sublock;
+ struct lov_sublock_env *subenv;
+
+ if (lock->cll_state != CLS_QUEUING) {
+ /*
+ * The lock might have left the QUEUING state if a previous
+ * iteration released its mutex. Stop enqueuing in this
+ * case and let the upper layer decide what to do.
+ */
+ LASSERT(i > 0 && result != 0);
+ break;
+ }
+
+ lls = &lck->lls_sub[i];
+ sub = lls->sub_lock;
+ /*
+ * Sub-lock might have been canceled while the top-lock
+ * was cached.
+ */
+ if (sub == NULL) {
+ result = lov_sublock_fill(env, lock, io, lck, i);
+ /* lov_sublock_fill() released @lock mutex,
+ * restart. */
+ break;
+ }
+ sublock = sub->lss_cl.cls_lock;
+ rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+ if (rc == 0) {
+ lov_sublock_hold(env, lck, i);
+ rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock,
+ subenv->lse_io, enqflags,
+ i == lck->lls_nr - 1);
+ minstate = min(minstate, sublock->cll_state);
+ if (rc == CLO_WAIT) {
+ switch (sublock->cll_state) {
+ case CLS_QUEUING:
+ /* take recursive mutex, the lock is
+ * released in lov_lock_enqueue_wait.
+ */
+ cl_lock_mutex_get(env, sublock);
+ lov_sublock_unlock(env, sub, closure,
+ subenv);
+ rc = lov_lock_enqueue_wait(env, lck,
+ sublock);
+ break;
+ case CLS_CACHED:
+ cl_lock_get(sublock);
+ /* take recursive mutex of sublock */
+ cl_lock_mutex_get(env, sublock);
+ /* need to release all locks in closure
+ * otherwise it may deadlock. LU-2683. */
+ lov_sublock_unlock(env, sub, closure,
+ subenv);
+ /* sublock and parent are held. */
+ rc = lov_sublock_release(env, lck, i,
+ 1, rc);
+ cl_lock_mutex_put(env, sublock);
+ cl_lock_put(env, sublock);
+ break;
+ default:
+ lov_sublock_unlock(env, sub, closure,
+ subenv);
+ break;
+ }
+ } else {
+ LASSERT(sublock->cll_conflict == NULL);
+ lov_sublock_unlock(env, sub, closure, subenv);
+ }
+ }
+ result = lov_subresult(result, rc);
+ if (result != 0)
+ break;
+ }
+ cl_lock_closure_fini(closure);
+ return result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT;
+}
+
+static int lov_lock_unuse(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+ int i;
+ int result;
+
+ for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+ int rc;
+ struct lovsub_lock *sub;
+ struct cl_lock *sublock;
+ struct lov_lock_sub *lls;
+ struct lov_sublock_env *subenv;
+
+ /* top-lock state cannot change concurrently, because a single
+ * thread (the one that released the last hold) carries unlocking
+ * through to completion. */
+ LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+ lls = &lck->lls_sub[i];
+ sub = lls->sub_lock;
+ if (sub == NULL)
+ continue;
+
+ sublock = sub->lss_cl.cls_lock;
+ rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+ if (rc == 0) {
+ if (lls->sub_flags & LSF_HELD) {
+ LASSERT(sublock->cll_state == CLS_HELD ||
+ sublock->cll_state == CLS_ENQUEUED);
+ rc = cl_unuse_try(subenv->lse_env, sublock);
+ rc = lov_sublock_release(env, lck, i, 0, rc);
+ }
+ lov_sublock_unlock(env, sub, closure, subenv);
+ }
+ result = lov_subresult(result, rc);
+ }
+
+ if (result == 0 && lck->lls_cancel_race) {
+ lck->lls_cancel_race = 0;
+ result = -ESTALE;
+ }
+ cl_lock_closure_fini(closure);
+ return result;
+}
+
+
+static void lov_lock_cancel(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+ int i;
+ int result;
+
+ for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+ int rc;
+ struct lovsub_lock *sub;
+ struct cl_lock *sublock;
+ struct lov_lock_sub *lls;
+ struct lov_sublock_env *subenv;
+
+ /* top-lock state cannot change concurrently, because a single
+ * thread (the one that released the last hold) carries unlocking
+ * through to completion. */
+ lls = &lck->lls_sub[i];
+ sub = lls->sub_lock;
+ if (sub == NULL)
+ continue;
+
+ sublock = sub->lss_cl.cls_lock;
+ rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+ if (rc == 0) {
+ if (!(lls->sub_flags & LSF_HELD)) {
+ lov_sublock_unlock(env, sub, closure, subenv);
+ continue;
+ }
+
+ switch (sublock->cll_state) {
+ case CLS_HELD:
+ rc = cl_unuse_try(subenv->lse_env, sublock);
+ lov_sublock_release(env, lck, i, 0, 0);
+ break;
+ default:
+ lov_sublock_release(env, lck, i, 1, 0);
+ break;
+ }
+ lov_sublock_unlock(env, sub, closure, subenv);
+ }
+
+ if (rc == CLO_REPEAT) {
+ --i;
+ continue;
+ }
+
+ result = lov_subresult(result, rc);
+ }
+
+ if (result)
+ CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock,
+ "lov_lock_cancel fails with %d.\n", result);
+
+ cl_lock_closure_fini(closure);
+}
+
+static int lov_lock_wait(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+ enum cl_lock_state minstate;
+ int reenqueued;
+ int result;
+ int i;
+
+again:
+ for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0;
+ i < lck->lls_nr; ++i) {
+ int rc;
+ struct lovsub_lock *sub;
+ struct cl_lock *sublock;
+ struct lov_lock_sub *lls;
+ struct lov_sublock_env *subenv;
+
+ lls = &lck->lls_sub[i];
+ sub = lls->sub_lock;
+ LASSERT(sub != NULL);
+ sublock = sub->lss_cl.cls_lock;
+ rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+ if (rc == 0) {
+ LASSERT(sublock->cll_state >= CLS_ENQUEUED);
+ if (sublock->cll_state < CLS_HELD)
+ rc = cl_wait_try(env, sublock);
+
+ minstate = min(minstate, sublock->cll_state);
+ lov_sublock_unlock(env, sub, closure, subenv);
+ }
+ if (rc == CLO_REENQUEUED) {
+ reenqueued++;
+ rc = 0;
+ }
+ result = lov_subresult(result, rc);
+ if (result != 0)
+ break;
+ }
+ /* Each sublock can only be re-enqueued once, so this will not loop
+ * forever. */
+ if (result == 0 && reenqueued != 0)
+ goto again;
+ cl_lock_closure_fini(closure);
+ return result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT;
+}
+
+static int lov_lock_use(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+ int result;
+ int i;
+
+ LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+
+ for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+ int rc;
+ struct lovsub_lock *sub;
+ struct cl_lock *sublock;
+ struct lov_lock_sub *lls;
+ struct lov_sublock_env *subenv;
+
+ LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+
+ lls = &lck->lls_sub[i];
+ sub = lls->sub_lock;
+ if (sub == NULL) {
+ /*
+ * Sub-lock might have been canceled while the top-lock
+ * was cached.
+ */
+ result = -ESTALE;
+ break;
+ }
+
+ sublock = sub->lss_cl.cls_lock;
+ rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+ if (rc == 0) {
+ LASSERT(sublock->cll_state != CLS_FREEING);
+ lov_sublock_hold(env, lck, i);
+ if (sublock->cll_state == CLS_CACHED) {
+ rc = cl_use_try(subenv->lse_env, sublock, 0);
+ if (rc != 0)
+ rc = lov_sublock_release(env, lck,
+ i, 1, rc);
+ } else if (sublock->cll_state == CLS_NEW) {
+ /* Sub-lock might have been canceled while the
+ * top-lock was cached. */
+ result = -ESTALE;
+ lov_sublock_release(env, lck, i, 1, result);
+ }
+ lov_sublock_unlock(env, sub, closure, subenv);
+ }
+ result = lov_subresult(result, rc);
+ if (result != 0)
+ break;
+ }
+
+ if (lck->lls_cancel_race) {
+ /*
+ * If unlocking happened at the same time, the sub-lock state
+ * should be FREEING, and lov_sublock_lock() should return
+ * CLO_REPEAT. In this case, return -ESTALE so that the upper
+ * layer resets the lock state to NEW.
+ */
+ lck->lls_cancel_race = 0;
+ LASSERT(result != 0);
+ result = -ESTALE;
+ }
+ cl_lock_closure_fini(closure);
+ return result;
+}
+
+#if 0
+static int lock_lock_multi_match()
+{
+ struct cl_lock *lock = slice->cls_lock;
+ struct cl_lock_descr *subneed = &lov_env_info(env)->lti_ldescr;
+ struct lov_object *loo = cl2lov(lov->lls_cl.cls_obj);
+ struct lov_layout_raid0 *r0 = lov_r0(loo);
+ struct lov_lock_sub *sub;
+ struct cl_object *subobj;
+ u64 fstart;
+ u64 fend;
+ u64 start;
+ u64 end;
+ int i;
+
+ fstart = cl_offset(need->cld_obj, need->cld_start);
+ fend = cl_offset(need->cld_obj, need->cld_end + 1) - 1;
+ subneed->cld_mode = need->cld_mode;
+ cl_lock_mutex_get(env, lock);
+ for (i = 0; i < lov->lls_nr; ++i) {
+ sub = &lov->lls_sub[i];
+ if (sub->sub_lock == NULL)
+ continue;
+ subobj = sub->sub_descr.cld_obj;
+ if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe,
+ fstart, fend, &start, &end))
+ continue;
+ subneed->cld_start = cl_index(subobj, start);
+ subneed->cld_end = cl_index(subobj, end);
+ subneed->cld_obj = subobj;
+ if (!cl_lock_ext_match(&sub->sub_got, subneed)) {
+ result = 0;
+ break;
+ }
+ }
+ cl_lock_mutex_put(env, lock);
+}
+#endif
+
+/**
+ * Check if the extent region \a descr is covered by \a child against the
+ * specific \a stripe.
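+ *
+ * For a single-stripe object this degenerates to a plain
+ * cl_lock_ext_match(). For a multi-stripe object, \a descr must span at
+ * most one stripe size and start and end either in \a stripe or in a
+ * stripe that is an LOV EA hole; the extent is then translated into the
+ * child object's index space before being matched against \a child.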
+ */
+static int lov_lock_stripe_is_matching(const struct lu_env *env,
+ struct lov_object *lov, int stripe,
+ const struct cl_lock_descr *child,
+ const struct cl_lock_descr *descr)
+{
+ struct lov_stripe_md *lsm = lov->lo_lsm;
+ u64 start;
+ u64 end;
+ int result;
+
+ if (lov_r0(lov)->lo_nr == 1)
+ return cl_lock_ext_match(child, descr);
+
+ /*
+ * For a multi-stripe object:
+ * - make sure the descr only covers child's stripe, and
+ * - check if extent is matching.
+ */
+ start = cl_offset(&lov->lo_cl, descr->cld_start);
+ end = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1;
+ result = 0;
+ /* glimpse should work on an object with an LOV EA hole. */
+ if (end - start <= lsm->lsm_stripe_size) {
+ int idx;
+
+ idx = lov_stripe_number(lsm, start);
+ if (idx == stripe ||
+ unlikely(lov_r0(lov)->lo_sub[idx] == NULL)) {
+ idx = lov_stripe_number(lsm, end);
+ if (idx == stripe ||
+ unlikely(lov_r0(lov)->lo_sub[idx] == NULL))
+ result = 1;
+ }
+ }
+
+ if (result != 0) {
+ struct cl_lock_descr *subd = &lov_env_info(env)->lti_ldescr;
+ u64 sub_start;
+ u64 sub_end;
+
+ subd->cld_obj = NULL; /* don't need sub object at all */
+ subd->cld_mode = descr->cld_mode;
+ subd->cld_gid = descr->cld_gid;
+ result = lov_stripe_intersects(lsm, stripe, start, end,
+ &sub_start, &sub_end);
+ LASSERT(result);
+ subd->cld_start = cl_index(child->cld_obj, sub_start);
+ subd->cld_end = cl_index(child->cld_obj, sub_end);
+ result = cl_lock_ext_match(child, subd);
+ }
+ return result;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_fits_into() method.
+ *
+ * Checks whether a lock (given by \a slice) is suitable for \a
+ * io. Multi-stripe locks can be used only for "quick" io, like truncate, or
+ * O_APPEND write.
+ *
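+ * The decision is made in the following order: the enqueue flags must
+ * match the original descriptor; a CLM_GROUP lock is always matched on
+ * extent; a lock with a single sub-lock is matched against its stripe
+ * via lov_lock_stripe_is_matching(); otherwise a multi-stripe lock is
+ * accepted only for setattr, misc, O_APPEND or glimpse (CLM_PHANTOM)
+ * requests, and then matched against the original, unexpanded extent.
+ *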
+ * \see ccc_lock_fits_into().
+ */
+static int lov_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io)
+{
+ struct lov_lock *lov = cl2lov_lock(slice);
+ struct lov_object *obj = cl2lov(slice->cls_obj);
+ int result;
+
+ LASSERT(cl_object_same(need->cld_obj, slice->cls_obj));
+ LASSERT(lov->lls_nr > 0);
+
+ /* for the top lock, it's necessary to match the enqueue flags;
+ * otherwise we run into problems if a sublock is missing and gets
+ * re-enqueued. */
+ if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags)
+ return 0;
+
+ if (need->cld_mode == CLM_GROUP)
+ /*
+ * always allow matching a group lock.
+ */
+ result = cl_lock_ext_match(&lov->lls_orig, need);
+ else if (lov->lls_nr == 1) {
+ struct cl_lock_descr *got = &lov->lls_sub[0].sub_got;
+ result = lov_lock_stripe_is_matching(env,
+ cl2lov(slice->cls_obj),
+ lov->lls_sub[0].sub_stripe,
+ got, need);
+ } else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC &&
+ !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM)
+ /*
+ * Multi-stripe locks are only suitable for `quick' IO and for
+ * glimpse.
+ */
+ result = 0;
+ else
+ /*
+ * Most general case: multi-stripe existing lock, and
+ * (potentially) multi-stripe @need lock. Check that @need is
+ * covered by @lov's sub-locks.
+ *
+ * For now, ignore lock expansions made by the server, and
+ * match against original lock extent.
+ */
+ result = cl_lock_ext_match(&lov->lls_orig, need);
+ CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n",
+ PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got),
+ lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr,
+ result);
+ return result;
+}
+
+void lov_lock_unlink(const struct lu_env *env,
+ struct lov_lock_link *link, struct lovsub_lock *sub)
+{
+ struct lov_lock *lck = link->lll_super;
+ struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+ LASSERT(cl_lock_is_mutexed(parent));
+ LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+
+ list_del_init(&link->lll_list);
+ LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub);
+ /* yank this sub-lock from parent's array */
+ lck->lls_sub[link->lll_idx].sub_lock = NULL;
+ LASSERT(lck->lls_nr_filled > 0);
+ lck->lls_nr_filled--;
+ lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock);
+ cl_lock_put(env, parent);
+ OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+}
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+ struct lov_lock *lck,
+ struct lovsub_lock *sub)
+{
+ struct lov_lock_link *scan;
+
+ LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+
+ list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+ if (scan->lll_super == lck)
+ return scan;
+ }
+ return NULL;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked for "top-to-bottom" delete, when lock destruction starts from the
+ * top-lock, e.g., as a result of inode destruction.
+ *
+ * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there:
+ * this is done separately elsewhere:
+ *
+ * - for inode destruction, lov_object_delete() calls cl_object_kill() for
+ * each sub-object, purging its locks;
+ *
+ * - in other cases (e.g., a fatal error with a top-lock) sub-locks are
+ * left in the cache.
+ */
+static void lov_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+ struct lov_lock_link *link;
+ int rc;
+ int i;
+
+ LASSERT(slice->cls_lock->cll_state == CLS_FREEING);
+
+ for (i = 0; i < lck->lls_nr; ++i) {
+ struct lov_lock_sub *lls = &lck->lls_sub[i];
+ struct lovsub_lock *lsl = lls->sub_lock;
+
+ if (lsl == NULL) /* already removed */
+ continue;
+
+ rc = lov_sublock_lock(env, lck, lls, closure, NULL);
+ if (rc == CLO_REPEAT) {
+ --i;
+ continue;
+ }
+
+ LASSERT(rc == 0);
+ LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING);
+
+ if (lls->sub_flags & LSF_HELD)
+ lov_sublock_release(env, lck, i, 1, 0);
+
+ link = lov_lock_link_find(env, lck, lsl);
+ LASSERT(link != NULL);
+ lov_lock_unlink(env, link, lsl);
+ LASSERT(lck->lls_sub[i].sub_lock == NULL);
+
+ lov_sublock_unlock(env, lsl, closure, NULL);
+ }
+
+ cl_lock_closure_fini(closure);
+}
+
+static int lov_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ int i;
+
+ (*p)(env, cookie, "%d\n", lck->lls_nr);
+ for (i = 0; i < lck->lls_nr; ++i) {
+ struct lov_lock_sub *sub;
+
+ sub = &lck->lls_sub[i];
+ (*p)(env, cookie, " %d %x: ", i, sub->sub_flags);
+ if (sub->sub_lock != NULL)
+ cl_lock_print(env, cookie, p,
+ sub->sub_lock->lss_cl.cls_lock);
+ else
+ (*p)(env, cookie, "---\n");
+ }
+ return 0;
+}
+
+static const struct cl_lock_operations lov_lock_ops = {
+ .clo_fini = lov_lock_fini,
+ .clo_enqueue = lov_lock_enqueue,
+ .clo_wait = lov_lock_wait,
+ .clo_use = lov_lock_use,
+ .clo_unuse = lov_lock_unuse,
+ .clo_cancel = lov_lock_cancel,
+ .clo_fits_into = lov_lock_fits_into,
+ .clo_delete = lov_lock_delete,
+ .clo_print = lov_lock_print
+};
+
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io)
+{
+ struct lov_lock *lck;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS);
+ if (lck != NULL) {
+ cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops);
+ result = lov_lock_sub_init(env, lck, io);
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+static void lov_empty_lock_fini(const struct lu_env *env,
+ struct cl_lock_slice *slice)
+{
+ struct lov_lock *lck = cl2lov_lock(slice);
+ OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+}
+
+static int lov_empty_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_lock_slice *slice)
+{
+ (*p)(env, cookie, "empty\n");
+ return 0;
+}
+
+/* XXX: more methods will be added later. */
+static const struct cl_lock_operations lov_empty_lock_ops = {
+ .clo_fini = lov_empty_lock_fini,
+ .clo_print = lov_empty_lock_print
+};
+
+int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io)
+{
+ struct lov_lock *lck;
+ int result = -ENOMEM;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, GFP_NOFS);
+ if (lck != NULL) {
+ cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops);
+ lck->lls_orig = lock->cll_descr;
+ result = 0;
+ }
+ return result;
+}
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+ struct cl_lock *parent)
+{
+ struct cl_lock_closure *closure;
+
+ closure = &lov_env_info(env)->lti_closure;
+ LASSERT(list_empty(&closure->clc_list));
+ cl_lock_closure_init(env, closure, parent, 1);
+ return closure;
+}
+
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c
new file mode 100644
index 000000000..b7e7bfabe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_merge.c
@@ -0,0 +1,186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_class.h"
+#include "lov_internal.h"
+
+/** Merge the lock value block (lvb) attributes and KMS from each of the
+ * stripes in a file into a single lvb. It is expected that the caller
+ * initializes the current atime, mtime and ctime to avoid regressing a
+ * more up-to-date time on the local client.
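+ *
+ * The per-stripe lvb_size and loi_kms values are mapped to file offsets
+ * with lov_stripe_size() and the maxima are taken; lvb_blocks are summed
+ * across stripes, and mtime/atime/ctime become the maximum of the
+ * per-stripe times and the caller-supplied current values.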
+ */
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+ struct ost_lvb *lvb, __u64 *kms_place)
+{
+ __u64 size = 0;
+ __u64 kms = 0;
+ __u64 blocks = 0;
+ s64 current_mtime = lvb->lvb_mtime;
+ s64 current_atime = lvb->lvb_atime;
+ s64 current_ctime = lvb->lvb_ctime;
+ int i;
+ int rc = 0;
+
+ assert_spin_locked(&lsm->lsm_lock);
+ LASSERT(lsm->lsm_lock_owner == current_pid());
+
+ CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s=%llu m=%llu a=%llu c=%llu b=%llu\n",
+ POSTID(&lsm->lsm_oi), lvb->lvb_size, lvb->lvb_mtime,
+ lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks);
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+ u64 lov_size, tmpsize;
+
+ if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) {
+ rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks);
+ continue;
+ }
+
+ tmpsize = loi->loi_kms;
+ lov_size = lov_stripe_size(lsm, tmpsize, i);
+ if (lov_size > kms)
+ kms = lov_size;
+
+ if (loi->loi_lvb.lvb_size > tmpsize)
+ tmpsize = loi->loi_lvb.lvb_size;
+
+ lov_size = lov_stripe_size(lsm, tmpsize, i);
+ if (lov_size > size)
+ size = lov_size;
+ /* merge blocks, mtime, atime */
+ blocks += loi->loi_lvb.lvb_blocks;
+ if (loi->loi_lvb.lvb_mtime > current_mtime)
+ current_mtime = loi->loi_lvb.lvb_mtime;
+ if (loi->loi_lvb.lvb_atime > current_atime)
+ current_atime = loi->loi_lvb.lvb_atime;
+ if (loi->loi_lvb.lvb_ctime > current_ctime)
+ current_ctime = loi->loi_lvb.lvb_ctime;
+
+ CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s=%llu m=%llu a=%llu c=%llu b=%llu\n",
+ POSTID(&lsm->lsm_oi), loi->loi_ost_idx,
+ loi->loi_lvb.lvb_size, loi->loi_lvb.lvb_mtime,
+ loi->loi_lvb.lvb_atime, loi->loi_lvb.lvb_ctime,
+ loi->loi_lvb.lvb_blocks);
+ }
+
+ *kms_place = kms;
+ lvb->lvb_size = size;
+ lvb->lvb_blocks = blocks;
+ lvb->lvb_mtime = current_mtime;
+ lvb->lvb_atime = current_atime;
+ lvb->lvb_ctime = current_ctime;
+ return rc;
+}
+
+/* Must be called under the lov_stripe_lock() */
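+/*
+ * With @shrink set, the KMS (and cached lvb_size) of every stripe is set
+ * to the stripe-local portion of @size; without it, only the KMS of the
+ * stripe holding the last byte of @size is considered, and it is raised
+ * but never lowered.
+ */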
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+ u64 size, int shrink)
+{
+ struct lov_oinfo *loi;
+ int stripe = 0;
+ __u64 kms;
+
+ assert_spin_locked(&lsm->lsm_lock);
+ LASSERT(lsm->lsm_lock_owner == current_pid());
+
+ if (shrink) {
+ for (; stripe < lsm->lsm_stripe_count; stripe++) {
+ struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+ kms = lov_size_to_stripe(lsm, size, stripe);
+ CDEBUG(D_INODE,
+ "stripe %d KMS %sing %llu->%llu\n",
+ stripe, kms > loi->loi_kms ? "increase":"shrink",
+ loi->loi_kms, kms);
+ loi_kms_set(loi, loi->loi_lvb.lvb_size = kms);
+ }
+ return 0;
+ }
+
+ if (size > 0)
+ stripe = lov_stripe_number(lsm, size - 1);
+ kms = lov_size_to_stripe(lsm, size, stripe);
+ loi = lsm->lsm_oinfo[stripe];
+
+ CDEBUG(D_INODE, "stripe %d KMS %sincreasing %llu->%llu\n",
+ stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms);
+ if (kms > loi->loi_kms)
+ loi_kms_set(loi, kms);
+
+ return 0;
+}
+
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, u64 valid,
+ struct lov_stripe_md *lsm, int stripeno, int *set)
+{
+ valid &= src->o_valid;
+
+ if (*set) {
+ if (valid & OBD_MD_FLSIZE) {
+ /* this handles sparse files properly */
+ u64 lov_size;
+
+ lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
+ if (lov_size > tgt->o_size)
+ tgt->o_size = lov_size;
+ }
+ if (valid & OBD_MD_FLBLOCKS)
+ tgt->o_blocks += src->o_blocks;
+ if (valid & OBD_MD_FLBLKSZ)
+ tgt->o_blksize += src->o_blksize;
+ if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
+ tgt->o_ctime = src->o_ctime;
+ if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
+ tgt->o_mtime = src->o_mtime;
+ if (valid & OBD_MD_FLDATAVERSION)
+ tgt->o_data_version += src->o_data_version;
+ } else {
+ memcpy(tgt, src, sizeof(*tgt));
+ tgt->o_oi = lsm->lsm_oi;
+ if (valid & OBD_MD_FLSIZE)
+ tgt->o_size = lov_stripe_size(lsm, src->o_size,
+ stripeno);
+ }
+
+ /* data_version needs to be valid on all stripes to be correct! */
+ if (!(valid & OBD_MD_FLDATAVERSION))
+ tgt->o_valid &= ~OBD_MD_FLDATAVERSION;
+
+ *set += 1;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c
new file mode 100644
index 000000000..027815766
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_obd.c
@@ -0,0 +1,2395 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_obd.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_support.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_mds.h"
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_param.h"
+#include "../include/cl_object.h"
+#include "../include/lclient.h" /* for cl_client_lru */
+#include "../include/lustre/ll_fiemap.h"
+#include "../include/lustre_fid.h"
+
+#include "lov_internal.h"
+
+/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion.
+ Any function that expects lov_tgts to remain stationary must take a ref. */
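+/*
+ * lov_putref() is the release side: when the last reference is dropped
+ * and lov_death_row is non-zero, targets flagged with ltd_reap are
+ * unhooked from lov_tgts[] under lov_lock and then disconnected via
+ * __lov_del_obd() outside the mutex.
+ */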
+static void lov_getref(struct obd_device *obd)
+{
+ struct lov_obd *lov = &obd->u.lov;
+
+ /* nobody gets through here until lov_putref is done */
+ mutex_lock(&lov->lov_lock);
+ atomic_inc(&lov->lov_refcount);
+ mutex_unlock(&lov->lov_lock);
+ return;
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
+
+static void lov_putref(struct obd_device *obd)
+{
+ struct lov_obd *lov = &obd->u.lov;
+
+ mutex_lock(&lov->lov_lock);
+ /* ok to dec to 0 more than once -- ltd_exp's will be null */
+ if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) {
+ LIST_HEAD(kill);
+ int i;
+ struct lov_tgt_desc *tgt, *n;
+ CDEBUG(D_CONFIG, "destroying %d lov targets\n",
+ lov->lov_death_row);
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ tgt = lov->lov_tgts[i];
+
+ if (!tgt || !tgt->ltd_reap)
+ continue;
+ list_add(&tgt->ltd_kill, &kill);
+ /* XXX - right now there is a dependency on ld_tgt_count
+ * being the maximum tgt index for computing the
+ * mds_max_easize. So we can't shrink it. */
+ lov_ost_pool_remove(&lov->lov_packed, i);
+ lov->lov_tgts[i] = NULL;
+ lov->lov_death_row--;
+ }
+ mutex_unlock(&lov->lov_lock);
+
+ list_for_each_entry_safe(tgt, n, &kill, ltd_kill) {
+ list_del(&tgt->ltd_kill);
+ /* Disconnect */
+ __lov_del_obd(obd, tgt);
+ }
+ } else {
+ mutex_unlock(&lov->lov_lock);
+ }
+}
+
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+ enum obd_notify_event ev);
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+ enum obd_notify_event ev, void *data);
+
+
+#define MAX_STRING_SIZE 128
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+ struct obd_connect_data *data)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ struct obd_uuid *tgt_uuid;
+ struct obd_device *tgt_obd;
+ static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
+ struct obd_import *imp;
+ struct proc_dir_entry *lov_proc_dir;
+ int rc;
+
+ if (!lov->lov_tgts[index])
+ return -EINVAL;
+
+ tgt_uuid = &lov->lov_tgts[index]->ltd_uuid;
+ tgt_obd = lov->lov_tgts[index]->ltd_obd;
+
+ if (!tgt_obd->obd_set_up) {
+ CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid));
+ return -EINVAL;
+ }
+
+ /* override the sp_me from lov */
+ tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me;
+
+ if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX))
+ data->ocd_index = index;
+
+ /*
+ * Divine LOV knows that OBDs under it are OSCs.
+ */
+ imp = tgt_obd->u.cli.cl_import;
+
+ if (activate) {
+ tgt_obd->obd_no_recov = 0;
+ /* FIXME this is probably supposed to be
+ ptlrpc_set_import_active. Horrible naming. */
+ ptlrpc_activate_import(imp);
+ }
+
+ rc = obd_register_observer(tgt_obd, obd);
+ if (rc) {
+ CERROR("Target %s register_observer error %d\n",
+ obd_uuid2str(tgt_uuid), rc);
+ return rc;
+ }
+
+
+ if (imp->imp_invalid) {
+ CDEBUG(D_CONFIG, "not connecting OSC %s; administratively disabled\n",
+ obd_uuid2str(tgt_uuid));
+ return 0;
+ }
+
+ rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd,
+ &lov_osc_uuid, data, NULL);
+ if (rc || !lov->lov_tgts[index]->ltd_exp) {
+ CERROR("Target %s connect error %d\n",
+ obd_uuid2str(tgt_uuid), rc);
+ return -ENODEV;
+ }
+
+ lov->lov_tgts[index]->ltd_reap = 0;
+
+ CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index,
+ obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in");
+
+ lov_proc_dir = obd->obd_proc_private;
+ if (lov_proc_dir) {
+ struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd;
+ struct proc_dir_entry *osc_symlink;
+
+ LASSERT(osc_obd != NULL);
+ LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC);
+ LASSERT(osc_obd->obd_type->typ_name != NULL);
+
+ osc_symlink = lprocfs_add_symlink(osc_obd->obd_name,
+ lov_proc_dir,
+ "../../../%s/%s",
+ osc_obd->obd_type->typ_name,
+ osc_obd->obd_name);
+ if (osc_symlink == NULL) {
+ CERROR("could not register LOV target /proc/fs/lustre/%s/%s/target_obds/%s.",
+ obd->obd_type->typ_name, obd->obd_name,
+ osc_obd->obd_name);
+ lprocfs_remove(&lov_proc_dir);
+ obd->obd_proc_private = NULL;
+ }
+ }
+
+ return 0;
+}
+
+static int lov_connect(const struct lu_env *env,
+ struct obd_export **exp, struct obd_device *obd,
+ struct obd_uuid *cluuid, struct obd_connect_data *data,
+ void *localdata)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ struct lov_tgt_desc *tgt;
+ struct lustre_handle conn;
+ int i, rc;
+
+ CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects);
+
+ rc = class_connect(&conn, obd, cluuid);
+ if (rc)
+ return rc;
+
+ *exp = class_conn2export(&conn);
+
+ /* Why should there ever be more than 1 connect? */
+ lov->lov_connects++;
+ LASSERT(lov->lov_connects == 1);
+
+ memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd));
+ if (data)
+ lov->lov_ocd = *data;
+
+ obd_getref(obd);
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ tgt = lov->lov_tgts[i];
+ if (!tgt || obd_uuid_empty(&tgt->ltd_uuid))
+ continue;
+ /* Flags will be lowest common denominator */
+ rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd);
+ if (rc) {
+ CERROR("%s: lov connect tgt %d failed: %d\n",
+ obd->obd_name, i, rc);
+ continue;
+ }
+ /* connect to an administratively disabled OST */
+ if (!lov->lov_tgts[i]->ltd_exp)
+ continue;
+
+ rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd,
+ OBD_NOTIFY_CONNECT, (void *)&i);
+ if (rc) {
+ CERROR("%s error sending notify %d\n",
+ obd->obd_name, rc);
+ }
+ }
+ obd_putref(obd);
+
+ return 0;
+}
+
+static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+ struct proc_dir_entry *lov_proc_dir;
+ struct lov_obd *lov = &obd->u.lov;
+ struct obd_device *osc_obd;
+ int rc;
+
+ osc_obd = class_exp2obd(tgt->ltd_exp);
+ CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
+ obd->obd_name, osc_obd ? osc_obd->obd_name : "NULL");
+
+ if (tgt->ltd_active) {
+ tgt->ltd_active = 0;
+ lov->desc.ld_active_tgt_count--;
+ tgt->ltd_exp->exp_obd->obd_inactive = 1;
+ }
+
+ if (osc_obd) {
+ lov_proc_dir = obd->obd_proc_private;
+ if (lov_proc_dir) {
+ lprocfs_remove_proc_entry(osc_obd->obd_name, lov_proc_dir);
+ }
+ /* Pass it on to our clients.
+ * XXX This should be an argument to disconnect,
+ * XXX not a back-door flag on the OBD. Ah well.
+ */
+ osc_obd->obd_force = obd->obd_force;
+ osc_obd->obd_fail = obd->obd_fail;
+ osc_obd->obd_no_recov = obd->obd_no_recov;
+ }
+
+ obd_register_observer(osc_obd, NULL);
+
+ rc = obd_disconnect(tgt->ltd_exp);
+ if (rc) {
+ CERROR("Target %s disconnect error %d\n",
+ tgt->ltd_uuid.uuid, rc);
+ rc = 0;
+ }
+
+ tgt->ltd_exp = NULL;
+ return 0;
+}
+
+static int lov_disconnect(struct obd_export *exp)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lov_obd *lov = &obd->u.lov;
+ int i, rc;
+
+ if (!lov->lov_tgts)
+ goto out;
+
+ /* Only disconnect the underlying layers on the final disconnect. */
+ lov->lov_connects--;
+ if (lov->lov_connects != 0) {
+ /* why should there be more than 1 connect? */
+ CERROR("disconnect #%d\n", lov->lov_connects);
+ goto out;
+ }
+
+ /* Let's hold another reference so lov_del_obd doesn't spin through
+ putref every time */
+ obd_getref(obd);
+
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) {
+ /* Disconnection is the last we know about an obd */
+ lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen);
+ }
+ }
+ obd_putref(obd);
+
+out:
+ rc = class_disconnect(exp); /* bz 9811 */
+ return rc;
+}
+
+/* Error codes:
+ *
+ * -EINVAL : UUID can't be found in the LOV's target list
+ * -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ * -EBADF : The UUID is found, but the OBD is the wrong type (!)
+ * any >= 0 : is log target index
+ */
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+ enum obd_notify_event ev)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ struct lov_tgt_desc *tgt;
+ int index, activate, active;
+
+ CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n",
+ lov, uuid->uuid, ev);
+
+ obd_getref(obd);
+ for (index = 0; index < lov->desc.ld_tgt_count; index++) {
+ tgt = lov->lov_tgts[index];
+ if (!tgt)
+ continue;
+ /*
+ * LU-642: an initially inactive OSC could have missed the
+ * obd_connect; we make up for it here.
+ */
+ if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL &&
+ obd_uuid_equals(uuid, &tgt->ltd_uuid)) {
+ struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"};
+
+ obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd,
+ &lov_osc_uuid, &lov->lov_ocd, NULL);
+ }
+ if (!tgt->ltd_exp)
+ continue;
+
+ CDEBUG(D_INFO, "lov idx %d is %s conn %#llx\n",
+ index, obd_uuid2str(&tgt->ltd_uuid),
+ tgt->ltd_exp->exp_handle.h_cookie);
+ if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+ break;
+ }
+
+ if (index == lov->desc.ld_tgt_count) {
+ index = -EINVAL;
+ goto out;
+ }
+
+ if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) {
+ activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0;
+
+ if (lov->lov_tgts[index]->ltd_activate == activate) {
+ CDEBUG(D_INFO, "OSC %s already %sactivate!\n",
+ uuid->uuid, activate ? "" : "de");
+ } else {
+ lov->lov_tgts[index]->ltd_activate = activate;
+ CDEBUG(D_CONFIG, "%sactivate OSC %s\n",
+ activate ? "" : "de", obd_uuid2str(uuid));
+ }
+
+ } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) {
+ active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0;
+
+ if (lov->lov_tgts[index]->ltd_active == active) {
+ CDEBUG(D_INFO, "OSC %s already %sactive!\n",
+ uuid->uuid, active ? "" : "in");
+ goto out;
+ } else {
+ CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n",
+ obd_uuid2str(uuid), active ? "" : "in");
+ }
+
+ lov->lov_tgts[index]->ltd_active = active;
+ if (active) {
+ lov->desc.ld_active_tgt_count++;
+ lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0;
+ } else {
+ lov->desc.ld_active_tgt_count--;
+ lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1;
+ }
+ } else {
+ CERROR("Unknown event(%d) for uuid %s", ev, uuid->uuid);
+ }
+
+ out:
+ obd_putref(obd);
+ return index;
+}
+
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+ enum obd_notify_event ev, void *data)
+{
+ int rc = 0;
+ struct lov_obd *lov = &obd->u.lov;
+
+ down_read(&lov->lov_notify_lock);
+ if (!lov->lov_connects) {
+ up_read(&lov->lov_notify_lock);
+ return rc;
+ }
+
+ if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE ||
+ ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) {
+ struct obd_uuid *uuid;
+
+ LASSERT(watched);
+
+ if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+ up_read(&lov->lov_notify_lock);
+ CERROR("unexpected notification of %s %s!\n",
+ watched->obd_type->typ_name,
+ watched->obd_name);
+ return -EINVAL;
+ }
+ uuid = &watched->u.cli.cl_target_uuid;
+
+ /* Set OSC as active before notifying the observer, so the
+ * observer can use the OSC normally.
+ */
+ rc = lov_set_osc_active(obd, uuid, ev);
+ if (rc < 0) {
+ up_read(&lov->lov_notify_lock);
+ CERROR("event(%d) of %s failed: %d\n", ev,
+ obd_uuid2str(uuid), rc);
+ return rc;
+ }
+ /* activation events should pass the LOV target index as data */
+ data = &rc;
+ }
+
+ /* Pass the notification up the chain. */
+ if (watched) {
+ rc = obd_notify_observer(obd, watched, ev, data);
+ } else {
+ /* A NULL watched means all OSCs in the LOV (only for syncs) */
+ /* sync events should send the LOV index as data */
+ struct lov_obd *lov = &obd->u.lov;
+ int i, is_sync;
+
+ data = &i;
+ is_sync = (ev == OBD_NOTIFY_SYNC) ||
+ (ev == OBD_NOTIFY_SYNC_NONBLOCK);
+
+ obd_getref(obd);
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ if (!lov->lov_tgts[i])
+ continue;
+
+ /* don't send sync event if target not
+ * connected/activated */
+ if (is_sync && !lov->lov_tgts[i]->ltd_active)
+ continue;
+
+ rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd,
+ ev, data);
+ if (rc) {
+ CERROR("%s: notify %s of %s failed %d\n",
+ obd->obd_name,
+ obd->obd_observer->obd_name,
+ lov->lov_tgts[i]->ltd_obd->obd_name,
+ rc);
+ }
+ }
+ obd_putref(obd);
+ }
+
+ up_read(&lov->lov_notify_lock);
+ return rc;
+}
+
+static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+ __u32 index, int gen, int active)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ struct lov_tgt_desc *tgt;
+ struct obd_device *tgt_obd;
+ int rc;
+
+ CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n",
+ uuidp->uuid, index, gen, active);
+
+ if (gen <= 0) {
+ CERROR("request to add OBD %s with invalid generation: %d\n",
+ uuidp->uuid, gen);
+ return -EINVAL;
+ }
+
+ tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME,
+ &obd->obd_uuid);
+ if (tgt_obd == NULL)
+ return -EINVAL;
+
+ mutex_lock(&lov->lov_lock);
+
+ if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) {
+ tgt = lov->lov_tgts[index];
+ CERROR("UUID %s already assigned at LOV target index %d\n",
+ obd_uuid2str(&tgt->ltd_uuid), index);
+ mutex_unlock(&lov->lov_lock);
+ return -EEXIST;
+ }
+
+ if (index >= lov->lov_tgt_size) {
+ /* We need to reallocate the lov target array. */
+ struct lov_tgt_desc **newtgts, **old = NULL;
+ __u32 newsize, oldsize = 0;
+
+ newsize = max_t(__u32, lov->lov_tgt_size, 2);
+ while (newsize < index + 1)
+ newsize <<= 1;
+ OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+ if (newtgts == NULL) {
+ mutex_unlock(&lov->lov_lock);
+ return -ENOMEM;
+ }
+
+ if (lov->lov_tgt_size) {
+ memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) *
+ lov->lov_tgt_size);
+ old = lov->lov_tgts;
+ oldsize = lov->lov_tgt_size;
+ }
+
+ lov->lov_tgts = newtgts;
+ lov->lov_tgt_size = newsize;
+ smp_rmb();
+ if (old)
+ OBD_FREE(old, sizeof(*old) * oldsize);
+
+ CDEBUG(D_CONFIG, "tgts: %p size: %d\n",
+ lov->lov_tgts, lov->lov_tgt_size);
+ }
+
+ OBD_ALLOC_PTR(tgt);
+ if (!tgt) {
+ mutex_unlock(&lov->lov_lock);
+ return -ENOMEM;
+ }
+
+ rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+ if (rc) {
+ mutex_unlock(&lov->lov_lock);
+ OBD_FREE_PTR(tgt);
+ return rc;
+ }
+
+ tgt->ltd_uuid = *uuidp;
+ tgt->ltd_obd = tgt_obd;
+ /* XXX - add a sanity check on the generation number. */
+ tgt->ltd_gen = gen;
+ tgt->ltd_index = index;
+ tgt->ltd_activate = active;
+ lov->lov_tgts[index] = tgt;
+ if (index >= lov->desc.ld_tgt_count)
+ lov->desc.ld_tgt_count = index + 1;
+
+ mutex_unlock(&lov->lov_lock);
+
+ CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
+ index, tgt->ltd_gen, lov->desc.ld_tgt_count);
+
+ rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index);
+
+ if (lov->lov_connects == 0) {
+ /* lov_connect hasn't been called yet. We'll do the
+ lov_connect_obd on this target when that fn first runs,
+ because we don't know the connect flags yet. */
+ return 0;
+ }
+
+ obd_getref(obd);
+
+ rc = lov_connect_obd(obd, index, active, &lov->lov_ocd);
+ if (rc)
+ goto out;
+
+ /* connect to an administratively disabled OST */
+ if (!tgt->ltd_exp) {
+ rc = 0;
+ goto out;
+ }
+
+ if (lov->lov_cache != NULL) {
+ rc = obd_set_info_async(NULL, tgt->ltd_exp,
+ sizeof(KEY_CACHE_SET), KEY_CACHE_SET,
+ sizeof(struct cl_client_cache), lov->lov_cache,
+ NULL);
+ if (rc < 0)
+ goto out;
+ }
+
+ rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
+ active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE,
+ (void *)&index);
+
+out:
+ if (rc) {
+ CERROR("add failed (%d), deleting %s\n", rc,
+ obd_uuid2str(&tgt->ltd_uuid));
+ lov_del_target(obd, index, NULL, 0);
+ }
+ obd_putref(obd);
+ return rc;
+}
+
+/* Schedule a target for deletion */
+int lov_del_target(struct obd_device *obd, __u32 index,
+ struct obd_uuid *uuidp, int gen)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ int count = lov->desc.ld_tgt_count;
+ int rc = 0;
+
+ if (index >= count) {
+ CERROR("LOV target index %d >= number of LOV OBDs %d.\n",
+ index, count);
+ return -EINVAL;
+ }
+
+ /* to make sure there's no ongoing lov_notify() now */
+ down_write(&lov->lov_notify_lock);
+ obd_getref(obd);
+
+ if (!lov->lov_tgts[index]) {
+ CERROR("LOV target at index %d is not setup.\n", index);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) {
+ CERROR("LOV target UUID %s at index %d doesn't match %s.\n",
+ lov_uuid2str(lov, index), index,
+ obd_uuid2str(uuidp));
+ rc = -EINVAL;
+ goto out;
+ }
+
+ CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
+ lov_uuid2str(lov, index), index,
+ lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp,
+ lov->lov_tgts[index]->ltd_active);
+
+ lov->lov_tgts[index]->ltd_reap = 1;
+ lov->lov_death_row++;
+ /* the target is actually deleted from obd_putref() */
+out:
+ obd_putref(obd);
+ up_write(&lov->lov_notify_lock);
+
+ return rc;
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+ struct obd_device *osc_obd;
+
+ LASSERT(tgt);
+ LASSERT(tgt->ltd_reap);
+
+ osc_obd = class_exp2obd(tgt->ltd_exp);
+
+ CDEBUG(D_CONFIG, "Removing tgt %s : %s\n",
+ tgt->ltd_uuid.uuid,
+ osc_obd ? osc_obd->obd_name : "<no obd>");
+
+ if (tgt->ltd_exp)
+ lov_disconnect_obd(obd, tgt);
+
+ OBD_FREE_PTR(tgt);
+
+ /* Manual cleanup - there are no cleanup logs to clean up the OSCs,
+ * so we must do it ourselves. And we can't do it from lov_cleanup,
+ * because we just lost our only reference to it. */
+ if (osc_obd)
+ class_manual_cleanup(osc_obd);
+}
+
+void lov_fix_desc_stripe_size(__u64 *val)
+{
+ if (*val < LOV_MIN_STRIPE_SIZE) {
+ if (*val != 0)
+ LCONSOLE_INFO("Increasing default stripe size to minimum %u\n",
+ LOV_DESC_STRIPE_SIZE_DEFAULT);
+ *val = LOV_DESC_STRIPE_SIZE_DEFAULT;
+ } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) {
+ *val &= ~(LOV_MIN_STRIPE_SIZE - 1);
+ LCONSOLE_WARN("Changing default stripe size to %llu (a multiple of %u)\n",
+ *val, LOV_MIN_STRIPE_SIZE);
+ }
+}
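+
+/* Illustrative behaviour (assuming LOV_MIN_STRIPE_SIZE is a power of two, as
+ * the mask above requires): a default stripe size of 0, or anything below the
+ * minimum, becomes LOV_DESC_STRIPE_SIZE_DEFAULT, while a value such as
+ * 3 * LOV_MIN_STRIPE_SIZE + 1 is rounded down to 3 * LOV_MIN_STRIPE_SIZE,
+ * i.e. the next lower multiple of the minimum stripe size. */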
+
+void lov_fix_desc_stripe_count(__u32 *val)
+{
+ if (*val == 0)
+ *val = 1;
+}
+
+void lov_fix_desc_pattern(__u32 *val)
+{
+ /* from lov_setstripe */
+ if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) {
+ LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
+ *val = 0;
+ }
+}
+
+void lov_fix_desc_qos_maxage(__u32 *val)
+{
+ if (*val == 0)
+ *val = LOV_DESC_QOS_MAXAGE_DEFAULT;
+}
+
+void lov_fix_desc(struct lov_desc *desc)
+{
+ lov_fix_desc_stripe_size(&desc->ld_default_stripe_size);
+ lov_fix_desc_stripe_count(&desc->ld_default_stripe_count);
+ lov_fix_desc_pattern(&desc->ld_pattern);
+ lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
+}
+
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+ struct lov_desc *desc;
+ struct lov_obd *lov = &obd->u.lov;
+ int rc;
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+ CERROR("LOV setup requires a descriptor\n");
+ return -EINVAL;
+ }
+
+ desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1);
+
+ if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+ CERROR("descriptor size wrong: %d > %d\n",
+ (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+ return -EINVAL;
+ }
+
+ if (desc->ld_magic != LOV_DESC_MAGIC) {
+ if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) {
+ CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n",
+ obd->obd_name, desc);
+ lustre_swab_lov_desc(desc);
+ } else {
+ CERROR("%s: Bad lov desc magic: %#x\n",
+ obd->obd_name, desc->ld_magic);
+ return -EINVAL;
+ }
+ }
+
+ lov_fix_desc(desc);
+
+ desc->ld_active_tgt_count = 0;
+ lov->desc = *desc;
+ lov->lov_tgt_size = 0;
+
+ mutex_init(&lov->lov_lock);
+ atomic_set(&lov->lov_refcount, 0);
+ lov->lov_sp_me = LUSTRE_SP_CLI;
+
+ init_rwsem(&lov->lov_notify_lock);
+
+ lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS,
+ HASH_POOLS_MAX_BITS,
+ HASH_POOLS_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &pool_hash_operations,
+ CFS_HASH_DEFAULT);
+ INIT_LIST_HEAD(&lov->lov_pool_list);
+ lov->lov_pool_count = 0;
+ rc = lov_ost_pool_init(&lov->lov_packed, 0);
+ if (rc)
+ goto out;
+
+ lprocfs_lov_init_vars(&lvars);
+ lprocfs_obd_setup(obd, lvars.obd_vars);
+#if defined(CONFIG_PROC_FS)
+ {
+ int rc1;
+
+ rc1 = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+ 0444, &lov_proc_target_fops, obd);
+ if (rc1)
+ CWARN("Error adding the target_obd file\n");
+ }
+#endif
+ lov->lov_pool_proc_entry = lprocfs_register("pools",
+ obd->obd_proc_entry,
+ NULL, NULL);
+
+ return 0;
+
+out:
+ return rc;
+}
+
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+ struct lov_obd *lov = &obd->u.lov;
+
+ switch (stage) {
+ case OBD_CLEANUP_EARLY: {
+ int i;
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+ continue;
+ obd_precleanup(class_exp2obd(lov->lov_tgts[i]->ltd_exp),
+ OBD_CLEANUP_EARLY);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int lov_cleanup(struct obd_device *obd)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ struct list_head *pos, *tmp;
+ struct pool_desc *pool;
+
+ list_for_each_safe(pos, tmp, &lov->lov_pool_list) {
+ pool = list_entry(pos, struct pool_desc, pool_list);
+ /* free pool structs */
+ CDEBUG(D_INFO, "delete pool %p\n", pool);
+ /* In the function below, .hs_keycmp resolves to
+ * pool_hashkey_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ lov_pool_del(obd, pool->pool_name);
+ }
+ cfs_hash_putref(lov->lov_pools_hash_body);
+ lov_ost_pool_free(&lov->lov_packed);
+
+ lprocfs_obd_cleanup(obd);
+ if (lov->lov_tgts) {
+ int i;
+ obd_getref(obd);
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ if (!lov->lov_tgts[i])
+ continue;
+
+ /* Inactive targets may never have connected */
+ if (lov->lov_tgts[i]->ltd_active ||
+ atomic_read(&lov->lov_refcount))
+ /* We should never get here - these
+ should have been removed in the
+ disconnect. */
+ CERROR("lov tgt %d not cleaned! deathrow=%d, lovrc=%d\n",
+ i, lov->lov_death_row,
+ atomic_read(&lov->lov_refcount));
+ lov_del_target(obd, i, NULL, 0);
+ }
+ obd_putref(obd);
+ OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
+ lov->lov_tgt_size);
+ lov->lov_tgt_size = 0;
+ }
+ return 0;
+}
+
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+ __u32 *indexp, int *genp)
+{
+ struct obd_uuid obd_uuid;
+ int cmd;
+ int rc = 0;
+
+ switch (cmd = lcfg->lcfg_command) {
+ case LCFG_LOV_ADD_OBD:
+ case LCFG_LOV_ADD_INA:
+ case LCFG_LOV_DEL_OBD: {
+ __u32 index;
+ int gen;
+ /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1));
+
+ if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ index = *indexp;
+ gen = *genp;
+ if (cmd == LCFG_LOV_ADD_OBD)
+ rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
+ else if (cmd == LCFG_LOV_ADD_INA)
+ rc = lov_add_target(obd, &obd_uuid, index, gen, 0);
+ else
+ rc = lov_del_target(obd, index, &obd_uuid, gen);
+ goto out;
+ }
+ case LCFG_PARAM: {
+ struct lprocfs_static_vars lvars = { NULL };
+ struct lov_desc *desc = &(obd->u.lov.desc);
+
+ if (!desc) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ lprocfs_lov_init_vars(&lvars);
+
+ rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars,
+ lcfg, obd);
+ if (rc > 0)
+ rc = 0;
+ goto out;
+ }
+ case LCFG_POOL_NEW:
+ case LCFG_POOL_ADD:
+ case LCFG_POOL_DEL:
+ case LCFG_POOL_REM:
+ goto out;
+
+ default: {
+ CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+ rc = -EINVAL;
+ goto out;
+
+ }
+ }
+out:
+ return rc;
+}
+
+static int lov_recreate(struct obd_export *exp, struct obdo *src_oa,
+ struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+ struct lov_stripe_md *obj_mdp, *lsm;
+ struct lov_obd *lov = &exp->exp_obd->u.lov;
+ unsigned ost_idx;
+ int rc, i;
+
+ LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS &&
+ src_oa->o_flags & OBD_FL_RECREATE_OBJS);
+
+ OBD_ALLOC(obj_mdp, sizeof(*obj_mdp));
+ if (obj_mdp == NULL)
+ return -ENOMEM;
+
+ ost_idx = src_oa->o_nlink;
+ lsm = *ea;
+ if (lsm == NULL) {
+ rc = -EINVAL;
+ goto out;
+ }
+ if (ost_idx >= lov->desc.ld_tgt_count ||
+ !lov->lov_tgts[ost_idx]) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (loi->loi_ost_idx == ost_idx) {
+ if (ostid_id(&loi->loi_oi) != ostid_id(&src_oa->o_oi)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ break;
+ }
+ }
+ if (i == lsm->lsm_stripe_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp,
+ src_oa, &obj_mdp, oti);
+out:
+ OBD_FREE(obj_mdp, sizeof(*obj_mdp));
+ return rc;
+}
+
+/* the LOV expects oa->o_id to be set to the LOV object id */
+static int lov_create(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *src_oa, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti)
+{
+ struct lov_obd *lov;
+ int rc = 0;
+
+ LASSERT(ea != NULL);
+ if (exp == NULL)
+ return -EINVAL;
+
+ if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+ src_oa->o_flags == OBD_FL_DELORPHAN) {
+ /* should not be used with LOV anymore */
+ LBUG();
+ }
+
+ lov = &exp->exp_obd->u.lov;
+ if (!lov->desc.ld_active_tgt_count)
+ return -EIO;
+
+ obd_getref(exp->exp_obd);
+ /* Recreate a specific object id at the given OST index */
+ if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+ (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) {
+ rc = lov_recreate(exp, src_oa, ea, oti);
+ }
+
+ obd_putref(exp->exp_obd);
+ return rc;
+}
+
+#define ASSERT_LSM_MAGIC(lsmp) \
+do { \
+ LASSERT((lsmp) != NULL); \
+ LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 || \
+ (lsmp)->lsm_magic == LOV_MAGIC_V3), \
+ "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic); \
+} while (0)
+
+static int lov_destroy(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md *lsm,
+ struct obd_trans_info *oti, struct obd_export *md_exp,
+ void *capa)
+{
+ struct lov_request_set *set;
+ struct obd_info oinfo;
+ struct lov_request *req;
+ struct list_head *pos;
+ struct lov_obd *lov;
+ int rc = 0, err = 0;
+
+ ASSERT_LSM_MAGIC(lsm);
+
+ if (!exp || !exp->exp_obd)
+ return -ENODEV;
+
+ if (oa->o_valid & OBD_MD_FLCOOKIE) {
+ LASSERT(oti);
+ LASSERT(oti->oti_logcookies);
+ }
+
+ lov = &exp->exp_obd->u.lov;
+ obd_getref(exp->exp_obd);
+ rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
+ if (rc)
+ goto out;
+
+ list_for_each(pos, &set->set_list) {
+ req = list_entry(pos, struct lov_request, rq_link);
+
+ if (oa->o_valid & OBD_MD_FLCOOKIE)
+ oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+ err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+ req->rq_oi.oi_oa, NULL, oti, NULL, capa);
+ err = lov_update_common_set(set, req, err);
+ if (err) {
+ CERROR("%s: destroying objid "DOSTID" subobj "
+ DOSTID" on OST idx %d: rc = %d\n",
+ exp->exp_obd->obd_name, POSTID(&oa->o_oi),
+ POSTID(&req->rq_oi.oi_oa->o_oi),
+ req->rq_idx, err);
+ if (!rc)
+ rc = err;
+ }
+ }
+
+ if (rc == 0) {
+ LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+ rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+ }
+ err = lov_fini_destroy_set(set);
+out:
+ obd_putref(exp->exp_obd);
+ return rc ? rc : err;
+}
+
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+
+ /* don't do attribute merge if this async op failed */
+ if (rc)
+ atomic_set(&lovset->set_completes, 0);
+ err = lov_fini_getattr_set(lovset);
+ return rc ? rc : err;
+}
+
+static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+ struct ptlrpc_request_set *rqset)
+{
+ struct lov_request_set *lovset;
+ struct lov_obd *lov;
+ struct list_head *pos;
+ struct lov_request *req;
+ int rc = 0, err;
+
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+ if (!exp || !exp->exp_obd)
+ return -ENODEV;
+
+ lov = &exp->exp_obd->u.lov;
+
+ rc = lov_prep_getattr_set(exp, oinfo, &lovset);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+ POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count,
+ oinfo->oi_md->lsm_stripe_size);
+
+ list_for_each(pos, &lovset->set_list) {
+ req = list_entry(pos, struct lov_request, rq_link);
+
+ CDEBUG(D_INFO, "objid " DOSTID "[%d] has subobj " DOSTID " at idx%u\n",
+ POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+ POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+ rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+ &req->rq_oi, rqset);
+ if (rc) {
+ CERROR("%s: getattr objid "DOSTID" subobj"
+ DOSTID" on OST idx %d: rc = %d\n",
+ exp->exp_obd->obd_name,
+ POSTID(&oinfo->oi_oa->o_oi),
+ POSTID(&req->rq_oi.oi_oa->o_oi),
+ req->rq_idx, rc);
+ goto out;
+ }
+ }
+
+ if (!list_empty(&rqset->set_requests)) {
+ LASSERT(rc == 0);
+ LASSERT(rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_getattr_interpret;
+ rqset->set_arg = (void *)lovset;
+ return rc;
+ }
+out:
+ if (rc)
+ atomic_set(&lovset->set_completes, 0);
+ err = lov_fini_getattr_set(lovset);
+ return rc ? rc : err;
+}
+
+static int lov_setattr_interpret(struct ptlrpc_request_set *rqset,
+ void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+
+ if (rc)
+ atomic_set(&lovset->set_completes, 0);
+ err = lov_fini_setattr_set(lovset);
+ return rc ? rc : err;
+}
+
+/* If @oti is given, the request comes from the MDS and responses from the OSTs
+ * are not needed. Otherwise, a client is waiting for the responses. */
+static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
+{
+ struct lov_request_set *set;
+ struct lov_request *req;
+ struct list_head *pos;
+ struct lov_obd *lov;
+ int rc = 0;
+
+ LASSERT(oinfo);
+ ASSERT_LSM_MAGIC(oinfo->oi_md);
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+ LASSERT(oti);
+ LASSERT(oti->oti_logcookies);
+ }
+
+ if (!exp || !exp->exp_obd)
+ return -ENODEV;
+
+ lov = &exp->exp_obd->u.lov;
+ rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+ POSTID(&oinfo->oi_md->lsm_oi),
+ oinfo->oi_md->lsm_stripe_count,
+ oinfo->oi_md->lsm_stripe_size);
+
+ list_for_each(pos, &set->set_list) {
+ req = list_entry(pos, struct lov_request, rq_link);
+
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+ oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+ CDEBUG(D_INFO, "objid " DOSTID "[%d] has subobj " DOSTID " at idx%u\n",
+ POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+ POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+ rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+ &req->rq_oi, oti, rqset);
+ if (rc) {
+ CERROR("error: setattr objid "DOSTID" subobj"
+ DOSTID" on OST idx %d: rc = %d\n",
+ POSTID(&set->set_oi->oi_oa->o_oi),
+ POSTID(&req->rq_oi.oi_oa->o_oi),
+ req->rq_idx, rc);
+ break;
+ }
+ }
+
+ /* If we are not waiting for responses on async requests, return. */
+ if (rc || !rqset || list_empty(&rqset->set_requests)) {
+ int err;
+ if (rc)
+ atomic_set(&set->set_completes, 0);
+ err = lov_fini_setattr_set(set);
+ return rc ? rc : err;
+ }
+
+ LASSERT(rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_setattr_interpret;
+ rqset->set_arg = (void *)set;
+
+ return 0;
+}
+
+/* Find any ldlm lock of the inode in lov.
+ * Returns: 0 if no lock is found
+ * 1 if one is found
+ * < 0 on error */
+static int lov_find_cbdata(struct obd_export *exp,
+ struct lov_stripe_md *lsm, ldlm_iterator_t it,
+ void *data)
+{
+ struct lov_obd *lov;
+ int rc = 0, i;
+
+ ASSERT_LSM_MAGIC(lsm);
+
+ if (!exp || !exp->exp_obd)
+ return -ENODEV;
+
+ lov = &exp->exp_obd->u.lov;
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_stripe_md submd;
+ struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (!lov->lov_tgts[loi->loi_ost_idx]) {
+ CDEBUG(D_HA, "lov idx %d NULL\n", loi->loi_ost_idx);
+ continue;
+ }
+
+ submd.lsm_oi = loi->loi_oi;
+ submd.lsm_stripe_count = 0;
+ rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+ &submd, it, data);
+ if (rc != 0)
+ return rc;
+ }
+ return rc;
+}
+
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc)
+{
+ struct lov_request_set *lovset = (struct lov_request_set *)data;
+ int err;
+
+ if (rc)
+ atomic_set(&lovset->set_completes, 0);
+
+ err = lov_fini_statfs_set(lovset);
+ return rc ? rc : err;
+}
+
+static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo,
+ __u64 max_age, struct ptlrpc_request_set *rqset)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lov_request_set *set;
+ struct lov_request *req;
+ struct list_head *pos;
+ struct lov_obd *lov;
+ int rc = 0;
+
+ LASSERT(oinfo != NULL);
+ LASSERT(oinfo->oi_osfs != NULL);
+
+ lov = &obd->u.lov;
+ rc = lov_prep_statfs_set(obd, oinfo, &set);
+ if (rc)
+ return rc;
+
+ list_for_each(pos, &set->set_list) {
+ req = list_entry(pos, struct lov_request, rq_link);
+ rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+ &req->rq_oi, max_age, rqset);
+ if (rc)
+ break;
+ }
+
+ if (rc || list_empty(&rqset->set_requests)) {
+ int err;
+ if (rc)
+ atomic_set(&set->set_completes, 0);
+ err = lov_fini_statfs_set(set);
+ return rc ? rc : err;
+ }
+
+ LASSERT(rqset->set_interpret == NULL);
+ rqset->set_interpret = lov_statfs_interpret;
+ rqset->set_arg = (void *)set;
+ return 0;
+}
+
+static int lov_statfs(const struct lu_env *env, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+ struct ptlrpc_request_set *set = NULL;
+ struct obd_info oinfo = { { { 0 } } };
+ int rc = 0;
+
+ /* for obdclass we forbid using obd_statfs_rqset; async statfs
+ * requests are used instead */
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ return -ENOMEM;
+
+ oinfo.oi_osfs = osfs;
+ oinfo.oi_flags = flags;
+ rc = lov_statfs_async(exp, &oinfo, max_age, set);
+ if (rc == 0)
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+
+ return rc;
+}
+
+static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void *uarg)
+{
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct lov_obd *lov = &obddev->u.lov;
+ int i = 0, rc = 0, count = lov->desc.ld_tgt_count;
+ struct obd_uuid *uuidp;
+
+ switch (cmd) {
+ case IOC_OBD_STATFS: {
+ struct obd_ioctl_data *data = karg;
+ struct obd_device *osc_obd;
+ struct obd_statfs stat_buf = {0};
+ __u32 index;
+ __u32 flags;
+
+ memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+ if (index >= count)
+ return -ENODEV;
+
+ if (!lov->lov_tgts[index])
+ /* Try again with the next index */
+ return -EAGAIN;
+ if (!lov->lov_tgts[index]->ltd_active)
+ return -ENODATA;
+
+ osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
+ if (!osc_obd)
+ return -EINVAL;
+
+ /* copy UUID */
+ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd),
+ min((int) data->ioc_plen2,
+ (int) sizeof(struct obd_uuid))))
+ return -EFAULT;
+
+ flags = uarg ? *(__u32 *)uarg : 0;
+ /* got statfs data */
+ rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ flags);
+ if (rc)
+ return rc;
+ if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+ min((int) data->ioc_plen1,
+ (int) sizeof(stat_buf))))
+ return -EFAULT;
+ break;
+ }
+ case OBD_IOC_LOV_GET_CONFIG: {
+ struct obd_ioctl_data *data;
+ struct lov_desc *desc;
+ char *buf = NULL;
+ __u32 *genp;
+
+ len = 0;
+ if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+ return -EINVAL;
+
+ data = (struct obd_ioctl_data *)buf;
+
+ if (sizeof(*desc) > data->ioc_inllen1) {
+ obd_ioctl_freedata(buf, len);
+ return -EINVAL;
+ }
+
+ if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) {
+ obd_ioctl_freedata(buf, len);
+ return -EINVAL;
+ }
+
+ if (sizeof(__u32) * count > data->ioc_inllen3) {
+ obd_ioctl_freedata(buf, len);
+ return -EINVAL;
+ }
+
+ desc = (struct lov_desc *)data->ioc_inlbuf1;
+ memcpy(desc, &(lov->desc), sizeof(*desc));
+
+ uuidp = (struct obd_uuid *)data->ioc_inlbuf2;
+ genp = (__u32 *)data->ioc_inlbuf3;
+ /* the uuid will be empty for deleted OSTs */
+ for (i = 0; i < count; i++, uuidp++, genp++) {
+ if (!lov->lov_tgts[i])
+ continue;
+ *uuidp = lov->lov_tgts[i]->ltd_uuid;
+ *genp = lov->lov_tgts[i]->ltd_gen;
+ }
+
+ if (copy_to_user((void *)uarg, buf, len))
+ rc = -EFAULT;
+ obd_ioctl_freedata(buf, len);
+ break;
+ }
+ case LL_IOC_LOV_GETSTRIPE:
+ rc = lov_getstripe(exp, karg, uarg);
+ break;
+ case OBD_IOC_QUOTACTL: {
+ struct if_quotactl *qctl = karg;
+ struct lov_tgt_desc *tgt = NULL;
+ struct obd_quotactl *oqctl;
+
+ if (qctl->qc_valid == QC_OSTIDX) {
+ if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+ return -EINVAL;
+
+ tgt = lov->lov_tgts[qctl->qc_idx];
+ if (!tgt || !tgt->ltd_exp)
+ return -EINVAL;
+ } else if (qctl->qc_valid == QC_UUID) {
+ for (i = 0; i < count; i++) {
+ tgt = lov->lov_tgts[i];
+ if (!tgt ||
+ !obd_uuid_equals(&tgt->ltd_uuid,
+ &qctl->obd_uuid))
+ continue;
+
+ if (tgt->ltd_exp == NULL)
+ return -EINVAL;
+
+ break;
+ }
+ } else {
+ return -EINVAL;
+ }
+
+ if (i >= count)
+ return -EAGAIN;
+
+ LASSERT(tgt && tgt->ltd_exp);
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl)
+ return -ENOMEM;
+
+ QCTL_COPY(oqctl, qctl);
+ rc = obd_quotactl(tgt->ltd_exp, oqctl);
+ if (rc == 0) {
+ QCTL_COPY(qctl, oqctl);
+ qctl->qc_valid = QC_OSTIDX;
+ qctl->obd_uuid = tgt->ltd_uuid;
+ }
+ OBD_FREE_PTR(oqctl);
+ break;
+ }
+ default: {
+ int set = 0;
+
+ if (count == 0)
+ return -ENOTTY;
+
+ for (i = 0; i < count; i++) {
+ int err;
+ struct obd_device *osc_obd;
+
+ /* OST was disconnected */
+ if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+ continue;
+
+ /* ll_umount_begin() sets the force flag on the lov, but not on the
+ * osc. Pass it through. */
+ osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+ osc_obd->obd_force = obddev->obd_force;
+ err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp,
+ len, karg, uarg);
+ if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+ return err;
+ } else if (err) {
+ if (lov->lov_tgts[i]->ltd_active) {
+ CDEBUG(err == -ENOTTY ?
+ D_IOCTL : D_WARNING,
+ "iocontrol OSC %s on OST idx %d cmd %x: err = %d\n",
+ lov_uuid2str(lov, i),
+ i, cmd, err);
+ if (!rc)
+ rc = err;
+ }
+ } else {
+ set = 1;
+ }
+ }
+ if (!set && !rc)
+ rc = -EIO;
+ }
+ }
+
+ return rc;
+}
+
+#define FIEMAP_BUFFER_SIZE 4096
+
+/**
+ * A non-zero fe_logical in the first fm_extent indicates that this is a
+ * continuation FIEMAP call; the end offset and the device of the previous
+ * mapping are passed in that extent.  From the saved device index this
+ * function works out the stripe on which mapping is to be restarted.
+ *
+ * It returns fm_end_offset, the in-OST offset at which mapping should be
+ * restarted.  If fm_end_offset=0 is returned, the caller will re-calculate
+ * the proper offset in the next stripe.
+ * Note that the first extent is passed to lov_get_info via the value field.
+ *
+ * \param fiemap fiemap request header
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe the stripe to restart the mapping from is returned here
+ */
+static u64 fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap,
+ struct lov_stripe_md *lsm, u64 fm_start,
+ u64 fm_end, int *start_stripe)
+{
+ u64 local_end = fiemap->fm_extents[0].fe_logical;
+ u64 lun_start, lun_end;
+ u64 fm_end_offset;
+ int stripe_no = -1, i;
+
+ if (fiemap->fm_extent_count == 0 ||
+ fiemap->fm_extents[0].fe_logical == 0)
+ return 0;
+
+ /* Find out stripe_no from ost_index saved in the fe_device */
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
+
+ if (lov_oinfo_is_dummy(oinfo))
+ continue;
+
+ if (oinfo->loi_ost_idx == fiemap->fm_extents[0].fe_device) {
+ stripe_no = i;
+ break;
+ }
+ }
+ if (stripe_no == -1)
+ return -EINVAL;
+
+ /* If we have finished mapping on previous device, shift logical
+ * offset to start of next device */
+ if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
+ &lun_start, &lun_end)) != 0 &&
+ local_end < lun_end) {
+ fm_end_offset = local_end;
+ *start_stripe = stripe_no;
+ } else {
+ /* This is a special value to indicate that caller should
+ * calculate offset in next stripe. */
+ fm_end_offset = 0;
+ *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
+ }
+
+ return fm_end_offset;
+}
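+
+/* Illustrative continuation case (hypothetical values): if a previous FIEMAP
+ * call stopped inside stripe N, the first extent carries that stripe's OST
+ * index in fe_device and the end offset in fe_logical.  When that offset is
+ * still below the stripe's lun_end, mapping resumes on stripe N at exactly
+ * that in-OST offset; otherwise 0 is returned and the caller starts at the
+ * beginning of stripe (N + 1) % lsm_stripe_count. */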
+
+/**
+ * Calculate the stripe on which the mapping will end.  If the length of the
+ * mapping is greater than (stripe_size * stripe_count) then last_stripe is
+ * the one just before start_stripe.  Otherwise we check which stripes the
+ * mapping intersects and find last_stripe from that.
+ * This function returns last_stripe and also sets the stripe_count over
+ * which the mapping is spread.
+ *
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe of the mapping
+ * \param stripe_count the number of stripes across which to map is returned
+ *
+ * \retval last_stripe return the last stripe of the mapping
+ */
+static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, u64 fm_start,
+ u64 fm_end, int start_stripe,
+ int *stripe_count)
+{
+ int last_stripe;
+ u64 obd_start, obd_end;
+ int i, j;
+
+ if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
+ last_stripe = start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
+ start_stripe - 1;
+ *stripe_count = lsm->lsm_stripe_count;
+ } else {
+ for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
+ i = (i + 1) % lsm->lsm_stripe_count, j++) {
+ if ((lov_stripe_intersects(lsm, i, fm_start, fm_end,
+ &obd_start, &obd_end)) == 0)
+ break;
+ }
+ *stripe_count = j;
+ last_stripe = (start_stripe + j - 1) % lsm->lsm_stripe_count;
+ }
+
+ return last_stripe;
+}
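+
+/* Worked example (hypothetical numbers): with lsm_stripe_count == 4 and a
+ * mapping longer than 4 * stripe_size starting on stripe 1, every stripe is
+ * covered, so *stripe_count becomes 4 and last_stripe is stripe 0, the one
+ * just before start_stripe.  For a shorter mapping the loop above only counts
+ * the stripes that actually intersect [fm_start, fm_end]. */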
+
+/**
+ * Set fe_device and copy extents from local buffer into main return buffer.
+ *
+ * \param fiemap fiemap request header
+ * \param lcl_fm_ext array of local fiemap extents to be copied
+ * \param ost_index OST index to be written into the fe_device field for
+ * each extent
+ * \param ext_count number of extents to be copied
+ * \param current_extent where to start copying in main extent array
+ */
+static void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap,
+ struct ll_fiemap_extent *lcl_fm_ext,
+ int ost_index, unsigned int ext_count,
+ int current_extent)
+{
+ char *to;
+ int ext;
+
+ for (ext = 0; ext < ext_count; ext++) {
+ lcl_fm_ext[ext].fe_device = ost_index;
+ lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
+ }
+
+ /* Copy fm_extent's from fm_local to return buffer */
+ to = (char *)fiemap + fiemap_count_to_size(current_extent);
+ memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent));
+}
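+
+/* Buffer layout note (illustrative): fiemap_count_to_size(current_extent) is
+ * the size of the fiemap header plus current_extent extents, so the memcpy()
+ * above appends the freshly mapped extents right after those already placed
+ * in the caller's buffer. */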
+
+/**
+ * Break down the FIEMAP request and send appropriate calls to individual OSTs.
+ * This also handles the restarting of FIEMAP calls in case the mapping
+ * overflows the available number of extents in a single call.
+ */
+static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key,
+ __u32 *vallen, void *val, struct lov_stripe_md *lsm)
+{
+ struct ll_fiemap_info_key *fm_key = key;
+ struct ll_user_fiemap *fiemap = val;
+ struct ll_user_fiemap *fm_local = NULL;
+ struct ll_fiemap_extent *lcl_fm_ext;
+ int count_local;
+ unsigned int get_num_extents = 0;
+ int ost_index = 0, actual_start_stripe, start_stripe;
+ u64 fm_start, fm_end, fm_length, fm_end_offset;
+ u64 curr_loc;
+ int current_extent = 0, rc = 0, i;
+ int ost_eof = 0; /* EOF for object */
+ int ost_done = 0; /* done with required mapping for this OST? */
+ int last_stripe;
+ int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count;
+ unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
+
+ if (!lsm_has_objects(lsm)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size)
+ buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count);
+
+ OBD_ALLOC_LARGE(fm_local, buffer_size);
+ if (fm_local == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ lcl_fm_ext = &fm_local->fm_extents[0];
+
+ count_local = fiemap_size_to_count(buffer_size);
+
+ memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+ fm_start = fiemap->fm_start;
+ fm_length = fiemap->fm_length;
+ /* Calculate start stripe, last stripe and length of mapping */
+ actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start);
+ fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size :
+ fm_start + fm_length - 1);
+ /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */
+ if (fm_end > fm_key->oa.o_size)
+ fm_end = fm_key->oa.o_size;
+
+ last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end,
+ actual_start_stripe, &stripe_count);
+
+ fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start,
+ fm_end, &start_stripe);
+ if (fm_end_offset == -EINVAL) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (fiemap_count_to_size(fiemap->fm_extent_count) > *vallen)
+ fiemap->fm_extent_count = fiemap_size_to_count(*vallen);
+ if (fiemap->fm_extent_count == 0) {
+ get_num_extents = 1;
+ count_local = 0;
+ }
+ /* Check each stripe */
+ for (cur_stripe = start_stripe, i = 0; i < stripe_count;
+ i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
+ u64 req_fm_len; /* Stores length of required mapping */
+ u64 len_mapped_single_call;
+ u64 lun_start, lun_end, obd_object_end;
+ unsigned int ext_count;
+
+ cur_stripe_wrap = cur_stripe;
+
+ /* Find out range of mapping on this stripe */
+ if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end,
+ &lun_start, &obd_object_end)) == 0)
+ continue;
+
+ if (lov_oinfo_is_dummy(lsm->lsm_oinfo[cur_stripe])) {
+ rc = -EIO;
+ goto out;
+ }
+
+ /* If this is a continuation FIEMAP call and we are on
+ * starting stripe then lun_start needs to be set to
+ * fm_end_offset */
+ if (fm_end_offset != 0 && cur_stripe == start_stripe)
+ lun_start = fm_end_offset;
+
+ if (fm_length != ~0ULL) {
+ /* Handle fm_start + fm_length overflow */
+ if (fm_start + fm_length < fm_start)
+ fm_length = ~0ULL - fm_start;
+ lun_end = lov_size_to_stripe(lsm, fm_start + fm_length,
+ cur_stripe);
+ } else {
+ lun_end = ~0ULL;
+ }
+
+ if (lun_start == lun_end)
+ continue;
+
+ req_fm_len = obd_object_end - lun_start;
+ fm_local->fm_length = 0;
+ len_mapped_single_call = 0;
+
+ /* If the output buffer is very large and the objects have many
+ * extents we may need to loop on a single OST repeatedly */
+ ost_eof = 0;
+ ost_done = 0;
+ do {
+ if (get_num_extents == 0) {
+ /* Don't get too many extents. */
+ if (current_extent + count_local >
+ fiemap->fm_extent_count)
+ count_local = fiemap->fm_extent_count -
+ current_extent;
+ }
+
+ lun_start += len_mapped_single_call;
+ fm_local->fm_length = req_fm_len - len_mapped_single_call;
+ req_fm_len = fm_local->fm_length;
+ fm_local->fm_extent_count = count_local;
+ fm_local->fm_mapped_extents = 0;
+ fm_local->fm_flags = fiemap->fm_flags;
+
+ fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi;
+ ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx;
+
+ if (ost_index < 0 ||
+ ost_index >= lov->desc.ld_tgt_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* If OST is inactive, return extent with UNKNOWN flag */
+ if (!lov->lov_tgts[ost_index]->ltd_active) {
+ fm_local->fm_flags |= FIEMAP_EXTENT_LAST;
+ fm_local->fm_mapped_extents = 1;
+
+ lcl_fm_ext[0].fe_logical = lun_start;
+ lcl_fm_ext[0].fe_length = obd_object_end -
+ lun_start;
+ lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
+
+ goto inactive_tgt;
+ }
+
+ fm_local->fm_start = lun_start;
+ fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+ memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local));
+ *vallen = fiemap_count_to_size(fm_local->fm_extent_count);
+ rc = obd_get_info(NULL,
+ lov->lov_tgts[ost_index]->ltd_exp,
+ keylen, key, vallen, fm_local, lsm);
+ if (rc != 0)
+ goto out;
+
+inactive_tgt:
+ ext_count = fm_local->fm_mapped_extents;
+ if (ext_count == 0) {
+ ost_done = 1;
+ /* If last stripe has hole at the end,
+ * then we need to return */
+ if (cur_stripe_wrap == last_stripe) {
+ fiemap->fm_mapped_extents = 0;
+ goto finish;
+ }
+ break;
+ }
+
+ /* If we just need num of extents then go to next device */
+ if (get_num_extents) {
+ current_extent += ext_count;
+ break;
+ }
+
+ len_mapped_single_call = lcl_fm_ext[ext_count - 1].fe_logical -
+ lun_start + lcl_fm_ext[ext_count - 1].fe_length;
+
+ /* Have we finished mapping on this device? */
+ if (req_fm_len <= len_mapped_single_call)
+ ost_done = 1;
+
+ /* Clear the EXTENT_LAST flag which can be present on
+ * last extent */
+ if (lcl_fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST)
+ lcl_fm_ext[ext_count - 1].fe_flags &=
+ ~FIEMAP_EXTENT_LAST;
+
+ curr_loc = lov_stripe_size(lsm,
+ lcl_fm_ext[ext_count - 1].fe_logical +
+ lcl_fm_ext[ext_count - 1].fe_length,
+ cur_stripe);
+ if (curr_loc >= fm_key->oa.o_size)
+ ost_eof = 1;
+
+ fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext,
+ ost_index, ext_count,
+ current_extent);
+
+ current_extent += ext_count;
+
+ /* Ran out of available extents? */
+ if (current_extent >= fiemap->fm_extent_count)
+ goto finish;
+ } while (ost_done == 0 && ost_eof == 0);
+
+ if (cur_stripe_wrap == last_stripe)
+ goto finish;
+ }
+
+finish:
+ /* Indicate that we are returning device offsets unless the file has
+ * just a single stripe */
+ if (lsm->lsm_stripe_count > 1)
+ fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
+
+ if (get_num_extents)
+ goto skip_last_device_calc;
+
+ /* Check if we have reached the last stripe and whether mapping for that
+ * stripe is done. */
+ if (cur_stripe_wrap == last_stripe) {
+ if (ost_done || ost_eof)
+ fiemap->fm_extents[current_extent - 1].fe_flags |=
+ FIEMAP_EXTENT_LAST;
+ }
+
+skip_last_device_calc:
+ fiemap->fm_mapped_extents = current_extent;
+
+out:
+ OBD_FREE_LARGE(fm_local, buffer_size);
+ return rc;
+}
+
+static int lov_get_info(const struct lu_env *env, struct obd_export *exp,
+ __u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm)
+{
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct lov_obd *lov = &obddev->u.lov;
+ int i, rc;
+
+ if (!vallen || !val)
+ return -EFAULT;
+
+ obd_getref(obddev);
+
+ if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+ struct {
+ char name[16];
+ struct ldlm_lock *lock;
+ } *data = key;
+ struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
+ struct lov_oinfo *loi;
+ __u32 *stripe = val;
+
+ if (*vallen < sizeof(*stripe)) {
+ rc = -EFAULT;
+ goto out;
+ }
+ *vallen = sizeof(*stripe);
+
+ /* XXX This is another one of those bits that will need to
+ * change if we ever actually support nested LOVs. It uses
+ * the lock's export to find out which stripe it is. */
+ /* XXX - it's assumed all the locks for deleted OSTs have
+ * been cancelled. Also, the export for deleted OSTs will
+ * be NULL and won't match the lock's export. */
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ loi = lsm->lsm_oinfo[i];
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (!lov->lov_tgts[loi->loi_ost_idx])
+ continue;
+ if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp ==
+ data->lock->l_conn_export &&
+ ostid_res_name_eq(&loi->loi_oi, res_id)) {
+ *stripe = i;
+ rc = 0;
+ goto out;
+ }
+ }
+ LDLM_ERROR(data->lock, "lock on inode without such object");
+ dump_lsm(D_ERROR, lsm);
+ rc = -ENXIO;
+ goto out;
+ } else if (KEY_IS(KEY_LAST_ID)) {
+ struct obd_id_info *info = val;
+ __u32 size = sizeof(u64);
+ struct lov_tgt_desc *tgt;
+
+ LASSERT(*vallen == sizeof(struct obd_id_info));
+ tgt = lov->lov_tgts[info->idx];
+
+ if (!tgt || !tgt->ltd_active) {
+ rc = -ESRCH;
+ goto out;
+ }
+
+ rc = obd_get_info(env, tgt->ltd_exp, keylen, key,
+ &size, info->data, NULL);
+ rc = 0;
+ goto out;
+ } else if (KEY_IS(KEY_LOVDESC)) {
+ struct lov_desc *desc_ret = val;
+ *desc_ret = lov->desc;
+
+ rc = 0;
+ goto out;
+ } else if (KEY_IS(KEY_FIEMAP)) {
+ rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
+ goto out;
+ } else if (KEY_IS(KEY_CONNECT_FLAG)) {
+ struct lov_tgt_desc *tgt;
+ __u64 ost_idx = *((__u64 *)val);
+
+ LASSERT(*vallen == sizeof(__u64));
+ LASSERT(ost_idx < lov->desc.ld_tgt_count);
+ tgt = lov->lov_tgts[ost_idx];
+
+ if (!tgt || !tgt->ltd_exp) {
+ rc = -ESRCH;
+ goto out;
+ }
+
+ *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp);
+ rc = 0;
+ goto out;
+ } else if (KEY_IS(KEY_TGT_COUNT)) {
+ *((int *)val) = lov->desc.ld_tgt_count;
+ rc = 0;
+ goto out;
+ }
+
+ rc = -EINVAL;
+
+out:
+ obd_putref(obddev);
+ return rc;
+}
+
+static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen,
+ void *val, struct ptlrpc_request_set *set)
+{
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct lov_obd *lov = &obddev->u.lov;
+ u32 count;
+ int i, rc = 0, err;
+ struct lov_tgt_desc *tgt;
+ unsigned incr, check_uuid, do_inactive, no_set;
+ unsigned next_id = 0, mds_con = 0, capa = 0;
+
+ incr = check_uuid = do_inactive = no_set = 0;
+ if (set == NULL) {
+ no_set = 1;
+ set = ptlrpc_prep_set();
+ if (!set)
+ return -ENOMEM;
+ }
+
+ obd_getref(obddev);
+ count = lov->desc.ld_tgt_count;
+
+ if (KEY_IS(KEY_NEXT_ID)) {
+ count = vallen / sizeof(struct obd_id_info);
+ vallen = sizeof(u64);
+ incr = sizeof(struct obd_id_info);
+ do_inactive = 1;
+ next_id = 1;
+ } else if (KEY_IS(KEY_CHECKSUM)) {
+ do_inactive = 1;
+ } else if (KEY_IS(KEY_EVICT_BY_NID)) {
+ /* use defaults: do_inactive = incr = 0; */
+ } else if (KEY_IS(KEY_MDS_CONN)) {
+ mds_con = 1;
+ } else if (KEY_IS(KEY_CAPA_KEY)) {
+ capa = 1;
+ } else if (KEY_IS(KEY_CACHE_SET)) {
+ LASSERT(lov->lov_cache == NULL);
+ lov->lov_cache = val;
+ do_inactive = 1;
+ }
+
+ for (i = 0; i < count; i++, val = (char *)val + incr) {
+ if (next_id) {
+ tgt = lov->lov_tgts[((struct obd_id_info *)val)->idx];
+ } else {
+ tgt = lov->lov_tgts[i];
+ }
+ /* OST was disconnected */
+ if (!tgt || !tgt->ltd_exp)
+ continue;
+
+ /* OST is inactive and we don't want inactive OSCs */
+ if (!tgt->ltd_active && !do_inactive)
+ continue;
+
+ if (mds_con) {
+ struct mds_group_info *mgi;
+
+ LASSERT(vallen == sizeof(*mgi));
+ mgi = (struct mds_group_info *)val;
+
+ /* Only want a specific OSC */
+ if (mgi->uuid && !obd_uuid_equals(mgi->uuid,
+ &tgt->ltd_uuid))
+ continue;
+
+ err = obd_set_info_async(env, tgt->ltd_exp,
+ keylen, key, sizeof(int),
+ &mgi->group, set);
+ } else if (next_id) {
+ err = obd_set_info_async(env, tgt->ltd_exp,
+ keylen, key, vallen,
+ ((struct obd_id_info *)val)->data, set);
+ } else if (capa) {
+ struct mds_capa_info *info = (struct mds_capa_info *)val;
+
+ LASSERT(vallen == sizeof(*info));
+
+ /* Only want a specific OSC */
+ if (info->uuid &&
+ !obd_uuid_equals(info->uuid, &tgt->ltd_uuid))
+ continue;
+
+ err = obd_set_info_async(env, tgt->ltd_exp, keylen,
+ key, sizeof(*info->capa),
+ info->capa, set);
+ } else {
+ /* Only want a specific OSC */
+ if (check_uuid &&
+ !obd_uuid_equals(val, &tgt->ltd_uuid))
+ continue;
+
+ err = obd_set_info_async(env, tgt->ltd_exp,
+ keylen, key, vallen, val, set);
+ }
+
+ if (!rc)
+ rc = err;
+ }
+
+ obd_putref(obddev);
+ if (no_set) {
+ err = ptlrpc_set_wait(set);
+ if (!rc)
+ rc = err;
+ ptlrpc_set_destroy(set);
+ }
+ return rc;
+}
+
+void lov_stripe_lock(struct lov_stripe_md *md)
+ __acquires(&md->lsm_lock)
+{
+ LASSERT(md->lsm_lock_owner != current_pid());
+ spin_lock(&md->lsm_lock);
+ LASSERT(md->lsm_lock_owner == 0);
+ md->lsm_lock_owner = current_pid();
+}
+EXPORT_SYMBOL(lov_stripe_lock);
+
+void lov_stripe_unlock(struct lov_stripe_md *md)
+ __releases(&md->lsm_lock)
+{
+ LASSERT(md->lsm_lock_owner == current_pid());
+ md->lsm_lock_owner = 0;
+ spin_unlock(&md->lsm_lock);
+}
+EXPORT_SYMBOL(lov_stripe_unlock);
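+
+/* Typical (illustrative) usage: callers bracket access to the striping data
+ * with lov_stripe_lock(lsm); ... lov_stripe_unlock(lsm);.  The
+ * lsm_lock_owner assertions above catch both recursive locking and an unlock
+ * issued by a thread that does not hold the lock. */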
+
+static int lov_quotactl(struct obd_device *obd, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ struct lov_tgt_desc *tgt;
+ __u64 curspace = 0;
+ __u64 bhardlimit = 0;
+ int i, rc = 0;
+
+ if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON &&
+ oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF &&
+ oqctl->qc_cmd != Q_GETOQUOTA &&
+ oqctl->qc_cmd != Q_INITQUOTA &&
+ oqctl->qc_cmd != LUSTRE_Q_SETQUOTA &&
+ oqctl->qc_cmd != Q_FINVALIDATE) {
+ CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd);
+ return -EFAULT;
+ }
+
+ /* for lov tgt */
+ obd_getref(obd);
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ int err;
+
+ tgt = lov->lov_tgts[i];
+
+ if (!tgt)
+ continue;
+
+ if (!tgt->ltd_active || tgt->ltd_reap) {
+ if (oqctl->qc_cmd == Q_GETOQUOTA &&
+ lov->lov_tgts[i]->ltd_activate) {
+ rc = -EREMOTEIO;
+ CERROR("ost %d is inactive\n", i);
+ } else {
+ CDEBUG(D_HA, "ost %d is inactive\n", i);
+ }
+ continue;
+ }
+
+ err = obd_quotactl(tgt->ltd_exp, oqctl);
+ if (err) {
+ if (tgt->ltd_active && !rc)
+ rc = err;
+ continue;
+ }
+
+ if (oqctl->qc_cmd == Q_GETOQUOTA) {
+ curspace += oqctl->qc_dqblk.dqb_curspace;
+ bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+ }
+ }
+ obd_putref(obd);
+
+ if (oqctl->qc_cmd == Q_GETOQUOTA) {
+ oqctl->qc_dqblk.dqb_curspace = curspace;
+ oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+ }
+ return rc;
+}
+
+static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct lov_obd *lov = &obd->u.lov;
+ int i, rc = 0;
+
+ obd_getref(obd);
+
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ if (!lov->lov_tgts[i])
+ continue;
+
+ /* Skip quota check on the administratively disabled OSTs. */
+ if (!lov->lov_tgts[i]->ltd_activate) {
+ CWARN("lov idx %d was administratively disabled, skip quotacheck on it.\n",
+ i);
+ continue;
+ }
+
+ if (!lov->lov_tgts[i]->ltd_active) {
+ CERROR("lov idx %d inactive\n", i);
+ rc = -EIO;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ int err;
+
+ if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate)
+ continue;
+
+ err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl);
+ if (err && !rc)
+ rc = err;
+ }
+
+out:
+ obd_putref(obd);
+
+ return rc;
+}
+
+static struct obd_ops lov_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_setup = lov_setup,
+ .o_precleanup = lov_precleanup,
+ .o_cleanup = lov_cleanup,
+ /*.o_process_config = lov_process_config,*/
+ .o_connect = lov_connect,
+ .o_disconnect = lov_disconnect,
+ .o_statfs = lov_statfs,
+ .o_statfs_async = lov_statfs_async,
+ .o_packmd = lov_packmd,
+ .o_unpackmd = lov_unpackmd,
+ .o_create = lov_create,
+ .o_destroy = lov_destroy,
+ .o_getattr_async = lov_getattr_async,
+ .o_setattr_async = lov_setattr_async,
+ .o_adjust_kms = lov_adjust_kms,
+ .o_find_cbdata = lov_find_cbdata,
+ .o_iocontrol = lov_iocontrol,
+ .o_get_info = lov_get_info,
+ .o_set_info_async = lov_set_info_async,
+ .o_notify = lov_notify,
+ .o_pool_new = lov_pool_new,
+ .o_pool_rem = lov_pool_remove,
+ .o_pool_add = lov_pool_add,
+ .o_pool_del = lov_pool_del,
+ .o_getref = lov_getref,
+ .o_putref = lov_putref,
+ .o_quotactl = lov_quotactl,
+ .o_quotacheck = lov_quotacheck,
+};
+
+struct kmem_cache *lov_oinfo_slab;
+
+static int __init lov_init(void)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+ int rc;
+
+ /* print an address of _any_ initialized kernel symbol from this
+ * module, to allow debugging with gdb that doesn't support data
+ * symbols from modules.*/
+ CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches);
+
+ rc = lu_kmem_init(lov_caches);
+ if (rc)
+ return rc;
+
+ lov_oinfo_slab = kmem_cache_create("lov_oinfo",
+ sizeof(struct lov_oinfo),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (lov_oinfo_slab == NULL) {
+ lu_kmem_fini(lov_caches);
+ return -ENOMEM;
+ }
+ lprocfs_lov_init_vars(&lvars);
+
+ rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars,
+ LUSTRE_LOV_NAME, &lov_device_type);
+
+ if (rc) {
+ kmem_cache_destroy(lov_oinfo_slab);
+ lu_kmem_fini(lov_caches);
+ }
+
+ return rc;
+}
+
+static void /*__exit*/ lov_exit(void)
+{
+ class_unregister_type(LUSTRE_LOV_NAME);
+ kmem_cache_destroy(lov_oinfo_slab);
+
+ lu_kmem_fini(lov_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+
+module_init(lov_init);
+module_exit(lov_exit);
diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
new file mode 100644
index 000000000..a22342fa7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_object.c
@@ -0,0 +1,1001 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOV layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+#include "../include/lclient.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Layout operations.
+ *
+ */
+
+struct lov_layout_operations {
+ int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+ struct lov_object *lov,
+ const struct cl_object_conf *conf,
+ union lov_layout_state *state);
+ int (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state);
+ void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state);
+ void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state);
+ int (*llo_print)(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o);
+ int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+ int (*llo_lock_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *io);
+ int (*llo_io_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io);
+ int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+};
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov);
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+static void lov_install_empty(const struct lu_env *env,
+ struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ /*
+ * File without objects.
+ */
+}
+
+static int lov_init_empty(const struct lu_env *env,
+ struct lov_device *dev, struct lov_object *lov,
+ const struct cl_object_conf *conf,
+ union lov_layout_state *state)
+{
+ return 0;
+}
+
+static void lov_install_raid0(const struct lu_env *env,
+ struct lov_object *lov,
+ union lov_layout_state *state)
+{
+}
+
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+ struct cl_device *dev,
+ const struct lu_fid *fid,
+ const struct cl_object_conf *conf)
+{
+ struct lu_object *o;
+
+ o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+ LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+ return lu2cl(o);
+}
+
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+ struct cl_object *stripe, struct lov_layout_raid0 *r0,
+ int idx)
+{
+ struct cl_object_header *hdr;
+ struct cl_object_header *subhdr;
+ struct cl_object_header *parent;
+ struct lov_oinfo *oinfo;
+ int result;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+ /* For sanity:test_206.
+ * Do not leave the object in cache to avoid accessing
+ * freed memory. This is because osc_object is referring to
+ * lov_oinfo of lsm_stripe_data which will be freed due to
+ * this failure. */
+ cl_object_kill(env, stripe);
+ cl_object_put(env, stripe);
+ return -EIO;
+ }
+
+ hdr = cl_object_header(lov2cl(lov));
+ subhdr = cl_object_header(stripe);
+
+ oinfo = lov->lo_lsm->lsm_oinfo[idx];
+ CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID
+ " idx: %d gen: %d\n",
+ PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
+ PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi),
+ oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+ /* reuse ->coh_attr_guard to protect coh_parent change */
+ spin_lock(&subhdr->coh_attr_guard);
+ parent = subhdr->coh_parent;
+ if (parent == NULL) {
+ subhdr->coh_parent = hdr;
+ spin_unlock(&subhdr->coh_attr_guard);
+ subhdr->coh_nesting = hdr->coh_nesting + 1;
+ lu_object_ref_add(&stripe->co_lu, "lov-parent", lov);
+ r0->lo_sub[idx] = cl2lovsub(stripe);
+ r0->lo_sub[idx]->lso_super = lov;
+ r0->lo_sub[idx]->lso_index = idx;
+ result = 0;
+ } else {
+ struct lu_object *old_obj;
+ struct lov_object *old_lov;
+ unsigned int mask = D_INODE;
+
+ spin_unlock(&subhdr->coh_attr_guard);
+ old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
+ LASSERT(old_obj != NULL);
+ old_lov = cl2lov(lu2cl(old_obj));
+ if (old_lov->lo_layout_invalid) {
+ /* the object's layout has already changed but isn't
+ * refreshed */
+ lu_object_unhash(env, &stripe->co_lu);
+ result = -EAGAIN;
+ } else {
+ mask = D_ERROR;
+ result = -EIO;
+ }
+
+ LU_OBJECT_DEBUG(mask, env, &stripe->co_lu,
+ "stripe %d is already owned.\n", idx);
+ LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n");
+ LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n");
+ cl_object_put(env, stripe);
+ }
+ return result;
+}
+
+static int lov_init_raid0(const struct lu_env *env,
+ struct lov_device *dev, struct lov_object *lov,
+ const struct cl_object_conf *conf,
+ union lov_layout_state *state)
+{
+ int result;
+ int i;
+
+ struct cl_object *stripe;
+ struct lov_thread_info *lti = lov_env_info(env);
+ struct cl_object_conf *subconf = &lti->lti_stripe_conf;
+ struct lov_stripe_md *lsm = conf->u.coc_md->lsm;
+ struct lu_fid *ofid = &lti->lti_fid;
+ struct lov_layout_raid0 *r0 = &state->raid0;
+
+ if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
+ dump_lsm(D_ERROR, lsm);
+ LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
+ LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
+ }
+
+ LASSERT(lov->lo_lsm == NULL);
+ lov->lo_lsm = lsm_addref(lsm);
+ r0->lo_nr = lsm->lsm_stripe_count;
+ LASSERT(r0->lo_nr <= lov_targets_nr(dev));
+
+ OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0]));
+ if (r0->lo_sub != NULL) {
+ result = 0;
+ subconf->coc_inode = conf->coc_inode;
+ spin_lock_init(&r0->lo_sub_lock);
+ /*
+ * Create stripe cl_objects.
+ */
+ for (i = 0; i < r0->lo_nr && result == 0; ++i) {
+ struct cl_device *subdev;
+ struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
+ int ost_idx = oinfo->loi_ost_idx;
+
+ if (lov_oinfo_is_dummy(oinfo))
+ continue;
+
+ result = ostid_to_fid(ofid, &oinfo->loi_oi,
+ oinfo->loi_ost_idx);
+ if (result != 0)
+ goto out;
+
+ subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+ subconf->u.coc_oinfo = oinfo;
+ LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
+ /* In the function below, .hs_keycmp resolves to
+ * lu_obj_hop_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ stripe = lov_sub_find(env, subdev, ofid, subconf);
+ if (!IS_ERR(stripe)) {
+ result = lov_init_sub(env, lov, stripe, r0, i);
+ if (result == -EAGAIN) { /* try again */
+ --i;
+ result = 0;
+ }
+ } else {
+ result = PTR_ERR(stripe);
+ }
+ }
+ } else
+ result = -ENOMEM;
+out:
+ return result;
+}
+
+static int lov_init_released(const struct lu_env *env,
+ struct lov_device *dev, struct lov_object *lov,
+ const struct cl_object_conf *conf,
+ union lov_layout_state *state)
+{
+ struct lov_stripe_md *lsm = conf->u.coc_md->lsm;
+
+ LASSERT(lsm != NULL);
+ LASSERT(lsm_is_released(lsm));
+ LASSERT(lov->lo_lsm == NULL);
+
+ lov->lo_lsm = lsm_addref(lsm);
+ return 0;
+}
+
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED);
+
+ lov_layout_wait(env, lov);
+
+ cl_object_prune(env, &lov->lo_cl);
+ return 0;
+}
+
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+ struct lovsub_object *los, int idx)
+{
+ struct cl_object *sub;
+ struct lov_layout_raid0 *r0;
+ struct lu_site *site;
+ struct lu_site_bkt_data *bkt;
+ wait_queue_t *waiter;
+
+ r0 = &lov->u.raid0;
+ LASSERT(r0->lo_sub[idx] == los);
+
+ sub = lovsub2cl(los);
+ site = sub->co_lu.lo_dev->ld_site;
+ bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
+ cl_object_kill(env, sub);
+ /* release a reference to the sub-object and ... */
+ lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+ cl_object_put(env, sub);
+
+ /* ... wait until it is actually destroyed---sub-object clears its
+ * ->lo_sub[] slot in lovsub_object_fini() */
+ if (r0->lo_sub[idx] == los) {
+ waiter = &lov_env_info(env)->lti_waiter;
+ init_waitqueue_entry(waiter, current);
+ add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1) {
+ /* this wait-queue is signaled at the end of
+ * lu_object_free(). */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_lock(&r0->lo_sub_lock);
+ if (r0->lo_sub[idx] == los) {
+ spin_unlock(&r0->lo_sub_lock);
+ schedule();
+ } else {
+ spin_unlock(&r0->lo_sub_lock);
+ set_current_state(TASK_RUNNING);
+ break;
+ }
+ }
+ remove_wait_queue(&bkt->lsb_marche_funebre, waiter);
+ }
+ LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ struct lov_layout_raid0 *r0 = &state->raid0;
+ struct lov_stripe_md *lsm = lov->lo_lsm;
+ int i;
+
+ dump_lsm(D_INODE, lsm);
+
+ lov_layout_wait(env, lov);
+ if (r0->lo_sub != NULL) {
+ for (i = 0; i < r0->lo_nr; ++i) {
+ struct lovsub_object *los = r0->lo_sub[i];
+
+ if (los != NULL) {
+ cl_locks_prune(env, &los->lso_cl, 1);
+ /*
+ * If top-level object is to be evicted from
+ * the cache, so are its sub-objects.
+ */
+ lov_subobject_kill(env, lov, los, i);
+ }
+ }
+ }
+ cl_object_prune(env, &lov->lo_cl);
+ return 0;
+}
+
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED);
+}
+
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ struct lov_layout_raid0 *r0 = &state->raid0;
+
+ if (r0->lo_sub != NULL) {
+ OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0]));
+ r0->lo_sub = NULL;
+ }
+
+ dump_lsm(D_INODE, lov->lo_lsm);
+ lov_free_memmd(&lov->lo_lsm);
+}
+
+static void lov_fini_released(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ dump_lsm(D_INODE, lov->lo_lsm);
+ lov_free_memmd(&lov->lo_lsm);
+}
+
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+ return 0;
+}
+
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ struct lov_object *lov = lu2lov(o);
+ struct lov_layout_raid0 *r0 = lov_r0(lov);
+ struct lov_stripe_md *lsm = lov->lo_lsm;
+ int i;
+
+ (*p)(env, cookie, "stripes: %d, %s, lsm{%p 0x%08X %d %u %u}:\n",
+ r0->lo_nr, lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+ lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+ lsm->lsm_stripe_count, lsm->lsm_layout_gen);
+ for (i = 0; i < r0->lo_nr; ++i) {
+ struct lu_object *sub;
+
+ if (r0->lo_sub[i] != NULL) {
+ sub = lovsub2lu(r0->lo_sub[i]);
+ lu_object_print(env, cookie, p, sub);
+ } else {
+ (*p)(env, cookie, "sub %d absent\n", i);
+ }
+ }
+ return 0;
+}
+
+static int lov_print_released(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ struct lov_object *lov = lu2lov(o);
+ struct lov_stripe_md *lsm = lov->lo_lsm;
+
+ (*p)(env, cookie,
+ "released: %s, lsm{%p 0x%08X %d %u %u}:\n",
+ lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+ lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+ lsm->lsm_stripe_count, lsm->lsm_layout_gen);
+ return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks, which is 0.
+ */
+static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ attr->cat_blocks = 0;
+ return 0;
+}
+
+static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ struct lov_object *lov = cl2lov(obj);
+ struct lov_layout_raid0 *r0 = lov_r0(lov);
+ struct cl_attr *lov_attr = &r0->lo_attr;
+ int result = 0;
+
+ /* this is called w/o holding the type guard mutex, so it must be
+ * inside an ongoing IO, otherwise the lsm may be replaced.
+ * LU-2117: it turns out there exists one exception. For mmaped files,
+ * the lock of those files may be requested in the other file's IO
+ * context, and this function is called in ccc_lock_state(), it will
+ * hit this assertion.
+ * Anyway, it's still okay to call attr_get w/o the type guard, as the
+ * layout can't change while locks exist. */
+ /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */
+
+ if (!r0->lo_attr_valid) {
+ struct lov_stripe_md *lsm = lov->lo_lsm;
+ struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb;
+ __u64 kms = 0;
+
+ memset(lvb, 0, sizeof(*lvb));
+ /* XXX: timestamps can be negative by sanity:test_39m,
+ * how can it be? */
+ lvb->lvb_atime = LLONG_MIN;
+ lvb->lvb_ctime = LLONG_MIN;
+ lvb->lvb_mtime = LLONG_MIN;
+
+ /*
+ * XXX that should be replaced with a loop over sub-objects,
+ * doing cl_object_attr_get() on them. But for now, let's
+ * reuse old lov code.
+ */
+
+ /*
+ * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
+ * happy. It's not needed, because new code uses
+ * ->coh_attr_guard spin-lock to protect consistency of
+ * sub-object attributes.
+ */
+ lov_stripe_lock(lsm);
+ result = lov_merge_lvb_kms(lsm, lvb, &kms);
+ lov_stripe_unlock(lsm);
+ if (result == 0) {
+ cl_lvb2attr(lov_attr, lvb);
+ lov_attr->cat_kms = kms;
+ r0->lo_attr_valid = 1;
+ }
+ }
+ if (result == 0) { /* merge results */
+ attr->cat_blocks = lov_attr->cat_blocks;
+ attr->cat_size = lov_attr->cat_size;
+ attr->cat_kms = lov_attr->cat_kms;
+ if (attr->cat_atime < lov_attr->cat_atime)
+ attr->cat_atime = lov_attr->cat_atime;
+ if (attr->cat_ctime < lov_attr->cat_ctime)
+ attr->cat_ctime = lov_attr->cat_ctime;
+ if (attr->cat_mtime < lov_attr->cat_mtime)
+ attr->cat_mtime = lov_attr->cat_mtime;
+ }
+ return result;
+}
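+
+/*
+ * Reading aid for lov_attr_get_raid0() above (descriptive only): blocks,
+ * size and kms are copied from the merged per-stripe attributes cached in
+ * lov_layout_raid0::lo_attr, while the [amc]time fields are only ever moved
+ * forward, so the caller keeps the newest timestamp it has seen.
+ */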
+
+static const struct lov_layout_operations lov_dispatch[] = {
+ [LLT_EMPTY] = {
+ .llo_init = lov_init_empty,
+ .llo_delete = lov_delete_empty,
+ .llo_fini = lov_fini_empty,
+ .llo_install = lov_install_empty,
+ .llo_print = lov_print_empty,
+ .llo_page_init = lov_page_init_empty,
+ .llo_lock_init = lov_lock_init_empty,
+ .llo_io_init = lov_io_init_empty,
+ .llo_getattr = lov_attr_get_empty
+ },
+ [LLT_RAID0] = {
+ .llo_init = lov_init_raid0,
+ .llo_delete = lov_delete_raid0,
+ .llo_fini = lov_fini_raid0,
+ .llo_install = lov_install_raid0,
+ .llo_print = lov_print_raid0,
+ .llo_page_init = lov_page_init_raid0,
+ .llo_lock_init = lov_lock_init_raid0,
+ .llo_io_init = lov_io_init_raid0,
+ .llo_getattr = lov_attr_get_raid0
+ },
+ [LLT_RELEASED] = {
+ .llo_init = lov_init_released,
+ .llo_delete = lov_delete_empty,
+ .llo_fini = lov_fini_released,
+ .llo_install = lov_install_empty,
+ .llo_print = lov_print_released,
+ .llo_page_init = lov_page_init_empty,
+ .llo_lock_init = lov_lock_init_empty,
+ .llo_io_init = lov_io_init_released,
+ .llo_getattr = lov_attr_get_empty
+ }
+};
+
+/**
+ * Performs a double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH_NOLOCK(obj, op, ...) \
+({ \
+ struct lov_object *__obj = (obj); \
+ enum lov_layout_type __llt; \
+ \
+ __llt = __obj->lo_type; \
+ LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \
+ lov_dispatch[__llt].op(__VA_ARGS__); \
+})
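+
+/*
+ * For illustration (not part of the dispatch machinery itself): a call
+ * such as
+ *
+ *	LOV_2DISPATCH_NOLOCK(lov, llo_print, env, cookie, p, o)
+ *
+ * boils down to lov_dispatch[lov->lo_type].llo_print(env, cookie, p, o):
+ * the first "dispatch" selects the per-layout operations vector and the
+ * second invokes the named method from it.
+ */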
+
+/**
+ * Return lov_layout_type associated with a given lsm
+ */
+static enum lov_layout_type lov_type(struct lov_stripe_md *lsm)
+{
+ if (lsm == NULL)
+ return LLT_EMPTY;
+ if (lsm_is_released(lsm))
+ return LLT_RELEASED;
+ return LLT_RAID0;
+}
+
+static inline void lov_conf_freeze(struct lov_object *lov)
+{
+ if (lov->lo_owner != current)
+ down_read(&lov->lo_type_guard);
+}
+
+static inline void lov_conf_thaw(struct lov_object *lov)
+{
+ if (lov->lo_owner != current)
+ up_read(&lov->lo_type_guard);
+}
+
+#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \
+({ \
+ struct lov_object *__obj = (obj); \
+ int __lock = !!(lock); \
+ typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \
+ \
+ if (__lock) \
+ lov_conf_freeze(__obj); \
+ __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \
+ if (__lock) \
+ lov_conf_thaw(__obj); \
+ __result; \
+})
+
+/**
+ * Performs a locked double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH(obj, op, ...) \
+ LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__)
+
+#define LOV_2DISPATCH_VOID(obj, op, ...) \
+do { \
+ struct lov_object *__obj = (obj); \
+ enum lov_layout_type __llt; \
+ \
+ lov_conf_freeze(__obj); \
+ __llt = __obj->lo_type; \
+ LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \
+ lov_dispatch[__llt].op(__VA_ARGS__); \
+ lov_conf_thaw(__obj); \
+} while (0)
+
+static void lov_conf_lock(struct lov_object *lov)
+{
+ LASSERT(lov->lo_owner != current);
+ down_write(&lov->lo_type_guard);
+ LASSERT(lov->lo_owner == NULL);
+ lov->lo_owner = current;
+}
+
+static void lov_conf_unlock(struct lov_object *lov)
+{
+ lov->lo_owner = NULL;
+ up_write(&lov->lo_type_guard);
+}
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov)
+{
+ struct l_wait_info lwi = { 0 };
+
+ while (atomic_read(&lov->lo_active_ios) > 0) {
+ CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n",
+ PFID(lu_object_fid(lov2lu(lov))),
+ atomic_read(&lov->lo_active_ios));
+
+ l_wait_event(lov->lo_waitq,
+ atomic_read(&lov->lo_active_ios) == 0, &lwi);
+ }
+ return 0;
+}
+
+static int lov_layout_change(const struct lu_env *unused,
+ struct lov_object *lov,
+ const struct cl_object_conf *conf)
+{
+ int result;
+ enum lov_layout_type llt = LLT_EMPTY;
+ union lov_layout_state *state = &lov->u;
+ const struct lov_layout_operations *old_ops;
+ const struct lov_layout_operations *new_ops;
+
+ struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+ void *cookie;
+ struct lu_env *env;
+ int refcheck;
+
+ LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch));
+
+ if (conf->u.coc_md != NULL)
+ llt = lov_type(conf->u.coc_md->lsm);
+ LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+
+ cookie = cl_env_reenter();
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env)) {
+ cl_env_reexit(cookie);
+ return PTR_ERR(env);
+ }
+
+ CDEBUG(D_INODE, DFID" from %s to %s\n",
+ PFID(lu_object_fid(lov2lu(lov))),
+ llt2str(lov->lo_type), llt2str(llt));
+
+ old_ops = &lov_dispatch[lov->lo_type];
+ new_ops = &lov_dispatch[llt];
+
+ result = old_ops->llo_delete(env, lov, &lov->u);
+ if (result == 0) {
+ old_ops->llo_fini(env, lov, &lov->u);
+
+ LASSERT(atomic_read(&lov->lo_active_ios) == 0);
+ LASSERT(hdr->coh_tree.rnode == NULL);
+ LASSERT(hdr->coh_pages == 0);
+
+ lov->lo_type = LLT_EMPTY;
+ result = new_ops->llo_init(env,
+ lu2lov_dev(lov->lo_cl.co_lu.lo_dev),
+ lov, conf, state);
+ if (result == 0) {
+ new_ops->llo_install(env, lov, state);
+ lov->lo_type = llt;
+ } else {
+ new_ops->llo_delete(env, lov, state);
+ new_ops->llo_fini(env, lov, state);
+ /* this file becomes an EMPTY file. */
+ }
+ }
+
+ cl_env_put(env, &refcheck);
+ cl_env_reexit(cookie);
+ return result;
+}
+
+/*****************************************************************************
+ *
+ * Lov object operations.
+ *
+ */
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf)
+{
+ struct lov_device *dev = lu2lov_dev(obj->lo_dev);
+ struct lov_object *lov = lu2lov(obj);
+ const struct cl_object_conf *cconf = lu2cl_conf(conf);
+ union lov_layout_state *set = &lov->u;
+ const struct lov_layout_operations *ops;
+ int result;
+
+ init_rwsem(&lov->lo_type_guard);
+ atomic_set(&lov->lo_active_ios, 0);
+ init_waitqueue_head(&lov->lo_waitq);
+
+ cl_object_page_init(lu2cl(obj), sizeof(struct lov_page));
+
+ /* no locking is necessary, as object is being created */
+ lov->lo_type = lov_type(cconf->u.coc_md->lsm);
+ ops = &lov_dispatch[lov->lo_type];
+ result = ops->llo_init(env, dev, lov, cconf, set);
+ if (result == 0)
+ ops->llo_install(env, lov, set);
+ return result;
+}
+
+static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf)
+{
+ struct lov_stripe_md *lsm = NULL;
+ struct lov_object *lov = cl2lov(obj);
+ int result = 0;
+
+ lov_conf_lock(lov);
+ if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+ lov->lo_layout_invalid = true;
+ result = 0;
+ goto out;
+ }
+
+ if (conf->coc_opc == OBJECT_CONF_WAIT) {
+ if (lov->lo_layout_invalid &&
+ atomic_read(&lov->lo_active_ios) > 0) {
+ lov_conf_unlock(lov);
+ result = lov_layout_wait(env, lov);
+ lov_conf_lock(lov);
+ }
+ goto out;
+ }
+
+ LASSERT(conf->coc_opc == OBJECT_CONF_SET);
+
+ if (conf->u.coc_md != NULL)
+ lsm = conf->u.coc_md->lsm;
+ if ((lsm == NULL && lov->lo_lsm == NULL) ||
+ ((lsm != NULL && lov->lo_lsm != NULL) &&
+ (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) &&
+ (lov->lo_lsm->lsm_pattern == lsm->lsm_pattern))) {
+ /* same version of layout */
+ lov->lo_layout_invalid = false;
+ result = 0;
+ goto out;
+ }
+
+ /* will change layout - check if there still exists active IO. */
+ if (atomic_read(&lov->lo_active_ios) > 0) {
+ lov->lo_layout_invalid = true;
+ result = -EBUSY;
+ goto out;
+ }
+
+ lov->lo_layout_invalid = lov_layout_change(env, lov, conf);
+
+out:
+ lov_conf_unlock(lov);
+ CDEBUG(D_INODE, DFID" lo_layout_invalid=%d\n",
+ PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid);
+ return result;
+}
+
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+ struct lov_object *lov = lu2lov(obj);
+
+ LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u);
+}
+
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+ struct lov_object *lov = lu2lov(obj);
+
+ LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u);
+ lu_object_fini(obj);
+ OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+}
+
+static int lov_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o);
+}
+
+int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage)
+{
+ return LOV_2DISPATCH_NOLOCK(cl2lov(obj),
+ llo_page_init, env, obj, page, vmpage);
+}
+
+/**
+ * Implements cl_object_operations::clo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ */
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io)
+{
+ CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+ return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+ !io->ci_ignore_layout, env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::clo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ /* do not take lock, as this function is called under a
+ * spin-lock. Layout is protected from changing by ongoing IO. */
+ return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+}
+
+static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ /*
+ * No dispatch is required here, as no layout implements this.
+ */
+ return 0;
+}
+
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io)
+{
+ /* No need to lock because we hold a reference on the layout. */
+ return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+ io);
+}
+
+static const struct cl_object_operations lov_ops = {
+ .coo_page_init = lov_page_init,
+ .coo_lock_init = lov_lock_init,
+ .coo_io_init = lov_io_init,
+ .coo_attr_get = lov_attr_get,
+ .coo_attr_set = lov_attr_set,
+ .coo_conf_set = lov_conf_set
+};
+
+static const struct lu_object_operations lov_lu_obj_ops = {
+ .loo_object_init = lov_object_init,
+ .loo_object_delete = lov_object_delete,
+ .loo_object_release = NULL,
+ .loo_object_free = lov_object_free,
+ .loo_object_print = lov_object_print,
+ .loo_object_invariant = NULL
+};
+
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *unused,
+ struct lu_device *dev)
+{
+ struct lov_object *lov;
+ struct lu_object *obj;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, GFP_NOFS);
+ if (lov != NULL) {
+ obj = lov2lu(lov);
+ lu_object_init(obj, NULL, dev);
+ lov->lo_cl.co_ops = &lov_ops;
+ lov->lo_type = -1; /* invalid, to catch uninitialized type */
+ /*
+ * object io operation vector (cl_object::co_iop) is installed
+ * later in lov_object_init(), as different vectors are used
+ * for object with different layouts.
+ */
+ obj->lo_ops = &lov_lu_obj_ops;
+ } else
+ obj = NULL;
+ return obj;
+}
+
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
+{
+ struct lov_stripe_md *lsm = NULL;
+
+ lov_conf_freeze(lov);
+ if (lov->lo_lsm != NULL) {
+ lsm = lsm_addref(lov->lo_lsm);
+ CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
+ lsm, atomic_read(&lsm->lsm_refc),
+ lov->lo_layout_invalid, current);
+ }
+ lov_conf_thaw(lov);
+ return lsm;
+}
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm)
+{
+ if (lsm == NULL)
+ return;
+
+ CDEBUG(D_INODE, "lsm %p decref %d by %p.\n",
+ lsm, atomic_read(&lsm->lsm_refc), current);
+
+ lov_free_memmd(&lsm);
+}
+
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj)
+{
+ struct lu_object *luobj;
+ struct lov_stripe_md *lsm = NULL;
+
+ if (clobj == NULL)
+ return NULL;
+
+ luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+ &lov_device_type);
+ if (luobj != NULL)
+ lsm = lov_lsm_addref(lu2lov(luobj));
+ return lsm;
+}
+EXPORT_SYMBOL(lov_lsm_get);
+
+void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm)
+{
+ if (lsm != NULL)
+ lov_free_memmd(&lsm);
+}
+EXPORT_SYMBOL(lov_lsm_put);
+
+int lov_read_and_clear_async_rc(struct cl_object *clob)
+{
+ struct lu_object *luobj;
+ int rc = 0;
+
+ luobj = lu_object_locate(&cl_object_header(clob)->coh_lu,
+ &lov_device_type);
+ if (luobj != NULL) {
+ struct lov_object *lov = lu2lov(luobj);
+
+ lov_conf_freeze(lov);
+ switch (lov->lo_type) {
+ case LLT_RAID0: {
+ struct lov_stripe_md *lsm;
+ int i;
+
+ lsm = lov->lo_lsm;
+ LASSERT(lsm != NULL);
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (loi->loi_ar.ar_rc && !rc)
+ rc = loi->loi_ar.ar_rc;
+ loi->loi_ar.ar_rc = 0;
+ }
+ }
+ case LLT_RELEASED:
+ case LLT_EMPTY:
+ break;
+ default:
+ LBUG();
+ }
+ lov_conf_thaw(lov);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(lov_read_and_clear_async_rc);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c
new file mode 100644
index 000000000..9c8c77c05
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_offset.c
@@ -0,0 +1,264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_class.h"
+
+#include "lov_internal.h"
+
+/* compute object size given "stripeno" and the ost size */
+u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size,
+ int stripeno)
+{
+ unsigned long ssize = lsm->lsm_stripe_size;
+ unsigned long stripe_size;
+ u64 swidth;
+ u64 lov_size;
+ int magic = lsm->lsm_magic;
+
+ if (ost_size == 0)
+ return 0;
+
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
+
+ /* lov_do_div64(a, b) returns a % b, and a = a / b */
+ stripe_size = lov_do_div64(ost_size, ssize);
+ if (stripe_size)
+ lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
+ else
+ lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
+
+ return lov_size;
+}
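+
+/*
+ * Worked example (illustrative numbers, not derived from any particular
+ * layout): with a 3-stripe file, ssize = 1 MiB and hence swidth = 3 MiB,
+ * stripeno = 1 and ost_size = 2.5 MiB, lov_do_div64() leaves ost_size = 2
+ * and stripe_size = 0.5 MiB, so
+ *
+ *	lov_size = 2 * 3 MiB + 1 * 1 MiB + 0.5 MiB = 7.5 MiB
+ *
+ * which is the end of the partially filled third unit of stripe 1 in the
+ * file. With ost_size = 2 MiB exactly, the remainder is 0 and the second
+ * branch gives (2 - 1) * 3 MiB + (1 + 1) * 1 MiB = 5 MiB.
+ */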
+
+/* we have an offset in file backed by an lov and want to find out where
+ * that offset lands in our given stripe of the file. for the easy
+ * case where the offset is within the stripe, we just have to scale the
+ * offset down to make it relative to the stripe instead of the lov.
+ *
+ * the harder case is what to do when the offset doesn't intersect the
+ * stripe. callers will want start offsets clamped ahead to the start
+ * of the nearest stripe in the file. end offsets similarly clamped to the
+ * nearest ending byte of a stripe in the file:
+ *
+ * all this function does is move offsets to the nearest region of the
+ * stripe, and it does its work "mod" the full length of all the stripes.
+ * consider a file with 3 stripes:
+ *
+ * S E
+ * ---------------------------------------------------------------------
+ * | 0 | 1 | 2 | 0 | 1 | 2 |
+ * ---------------------------------------------------------------------
+ *
+ * to find stripe 1's offsets for S and E, it divides by the full stripe
+ * width and does its math in the context of a single set of stripes:
+ *
+ * S E
+ * -----------------------------------
+ * | 0 | 1 | 2 |
+ * -----------------------------------
+ *
+ * it'll notice that E is outside stripe 1 and clamp it to the end of the
+ * stripe, then multiply it back out by lov_off to give the real offsets in
+ * the stripe:
+ *
+ * S E
+ * ---------------------------------------------------------------------
+ * | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------------------------------------------
+ *
+ * it would have done similarly and pulled S forward to the start of a 1
+ * stripe if, say, S had landed in a 0 stripe.
+ *
+ * this rounding isn't always correct. consider an E lov offset that lands
+ * on a 0 stripe, the "mod stripe width" math will pull it forward to the
+ * start of a 1 stripe, when in fact it wanted to be rounded back to the end
+ * of a previous 1 stripe. this logic is handled by callers and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question; 0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte. */
+int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off,
+ int stripeno, u64 *obdoff)
+{
+ unsigned long ssize = lsm->lsm_stripe_size;
+ u64 stripe_off, this_stripe, swidth;
+ int magic = lsm->lsm_magic;
+ int ret = 0;
+
+ if (lov_off == OBD_OBJECT_EOF) {
+ *obdoff = OBD_OBJECT_EOF;
+ return 0;
+ }
+
+ LASSERT(lsm_op_find(magic) != NULL);
+
+ lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
+ &swidth);
+
+ /* lov_do_div64(a, b) returns a % b, and a = a / b */
+ stripe_off = lov_do_div64(lov_off, swidth);
+
+ this_stripe = (u64)stripeno * ssize;
+ if (stripe_off < this_stripe) {
+ stripe_off = 0;
+ ret = -1;
+ } else {
+ stripe_off -= this_stripe;
+
+ if (stripe_off >= ssize) {
+ stripe_off = ssize;
+ ret = 1;
+ }
+ }
+
+ *obdoff = lov_off * ssize + stripe_off;
+ return ret;
+}
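+
+/*
+ * Worked example (illustrative numbers): 3 stripes, ssize = 1 MiB,
+ * swidth = 3 MiB, stripeno = 1.
+ *
+ *	lov_off = 7.5 MiB: quotient 2, stripe_off = 1.5 MiB - this_stripe
+ *	(1 MiB) = 0.5 MiB, so *obdoff = 2 * 1 MiB + 0.5 MiB = 2.5 MiB and
+ *	the function returns 0 (the offset falls inside stripe 1).
+ *
+ *	lov_off = 0.5 MiB: the remainder 0.5 MiB is below this_stripe, so
+ *	the offset is moved forward to the start of the stripe,
+ *	*obdoff = 0, and the function returns -1.
+ */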
+
+/* Given a whole-file size and a stripe number, give the file size which
+ * corresponds to the individual object of that stripe.
+ *
+ * This behaves basically in the same way as lov_stripe_offset, except that
+ * file sizes falling before the beginning of a stripe are clamped to the end
+ * of the previous stripe, not the beginning of the next:
+ *
+ * S
+ * ---------------------------------------------------------------------
+ * | 0 | 1 | 2 | 0 | 1 | 2 |
+ * ---------------------------------------------------------------------
+ *
+ * if clamped to stripe 2 becomes:
+ *
+ * S
+ * ---------------------------------------------------------------------
+ * | 0 | 1 | 2 | 0 | 1 | 2 |
+ * ---------------------------------------------------------------------
+ */
+u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size,
+ int stripeno)
+{
+ unsigned long ssize = lsm->lsm_stripe_size;
+ u64 stripe_off, this_stripe, swidth;
+ int magic = lsm->lsm_magic;
+
+ if (file_size == OBD_OBJECT_EOF)
+ return OBD_OBJECT_EOF;
+
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
+ &swidth);
+
+ /* lov_do_div64(a, b) returns a % b, and a = a / b */
+ stripe_off = lov_do_div64(file_size, swidth);
+
+ this_stripe = (u64)stripeno * ssize;
+ if (stripe_off < this_stripe) {
+ /* Move to end of previous stripe, or zero */
+ if (file_size > 0) {
+ file_size--;
+ stripe_off = ssize;
+ } else {
+ stripe_off = 0;
+ }
+ } else {
+ stripe_off -= this_stripe;
+
+ if (stripe_off >= ssize) {
+ /* Clamp to end of this stripe */
+ stripe_off = ssize;
+ }
+ }
+
+ return (file_size * ssize + stripe_off);
+}
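+
+/*
+ * Worked example (illustrative numbers): 3 stripes, ssize = 1 MiB,
+ * swidth = 3 MiB.
+ *
+ *	file_size = 5 MiB, stripeno = 1: the remainder 2 MiB is past
+ *	this_stripe (1 MiB) and the resulting 1 MiB is clamped to ssize, so
+ *	the object size is 1 * 1 MiB + 1 MiB = 2 MiB (stripe 1 holds two
+ *	full units).
+ *
+ *	file_size = 3.5 MiB, stripeno = 2: the remainder 0.5 MiB is before
+ *	this_stripe (2 MiB), so the size is clamped back to the end of the
+ *	previous stripe-2 unit: 0 * 1 MiB + 1 MiB = 1 MiB.
+ */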
+
+/* given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent. this returns true if the given
+ * stripe does intersect with the lov extent. */
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+ u64 start, u64 end, u64 *obd_start, u64 *obd_end)
+{
+ int start_side, end_side;
+
+ start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
+ end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
+
+ CDEBUG(D_INODE, "[%llu->%llu] -> [(%d) %llu->%llu (%d)]\n",
+ start, end, start_side, *obd_start, *obd_end, end_side);
+
+ /* this stripe doesn't intersect the file extent when neither
+ * the start nor the end intersected the stripe and obd_start and
+ * obd_end got rounded up to the same value. */
+ if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+ return 0;
+
+ /* as mentioned in the lov_stripe_offset commentary, end
+ * might have been shifted in the wrong direction. This
+ * happens when an end offset is before the stripe when viewed
+ * through the "mod stripe size" math. we detect it being shifted
+ * in the wrong direction and touch it up.
+ * interestingly, this can't underflow since end must be > start
+ * if we passed through the previous check.
+ * (should we assert for that somewhere?) */
+ if (end_side != 0)
+ (*obd_end)--;
+
+ return 1;
+}
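+
+/*
+ * Worked example (illustrative numbers): 3 stripes, ssize = 1 MiB, file
+ * extent [0.5 MiB, 7.5 MiB], stripeno = 1. The start is clamped forward
+ * to obd_start = 0 (start_side = -1) and the end maps to obd_end = 2.5 MiB
+ * (end_side = 0), so the stripe intersects the extent as [0, 2.5 MiB].
+ * For an extent that lies entirely inside stripe 0, both ends would be
+ * clamped to the same object offset and the function would return 0.
+ */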
+
+/* compute which stripe number "lov_off" will be written into */
+int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off)
+{
+ unsigned long ssize = lsm->lsm_stripe_size;
+ u64 stripe_off, swidth;
+ int magic = lsm->lsm_magic;
+
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
+
+ stripe_off = lov_do_div64(lov_off, swidth);
+
+ /* Puts stripe_off/ssize result into stripe_off */
+ lov_do_div64(stripe_off, ssize);
+
+ return stripe_off;
+}
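+
+/*
+ * Worked example (illustrative numbers): 3 stripes, ssize = 1 MiB,
+ * swidth = 3 MiB. For lov_off = 7.5 MiB the remainder modulo swidth is
+ * 1.5 MiB, and 1.5 MiB / 1 MiB = 1, so the offset is written to stripe 1.
+ */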
diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c
new file mode 100644
index 000000000..5356d5324
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_pack.c
@@ -0,0 +1,511 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "../include/lustre_net.h"
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre/lustre_user.h"
+
+#include "lov_internal.h"
+
+void lov_dump_lmm_common(int level, void *lmmp)
+{
+ struct lov_mds_md *lmm = lmmp;
+ struct ost_id oi;
+
+ lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi);
+ CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n",
+ POSTID(&oi), le32_to_cpu(lmm->lmm_magic),
+ le32_to_cpu(lmm->lmm_pattern));
+ CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n",
+ le32_to_cpu(lmm->lmm_stripe_size),
+ le16_to_cpu(lmm->lmm_stripe_count),
+ le16_to_cpu(lmm->lmm_layout_gen));
+}
+
+static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod,
+ int stripe_count)
+{
+ int i;
+
+ if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+ CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+ stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+ return;
+ }
+
+ for (i = 0; i < stripe_count; ++i, ++lod) {
+ struct ost_id oi;
+
+ ostid_le_to_cpu(&lod->l_ost_oi, &oi);
+ CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i,
+ le32_to_cpu(lod->l_ost_idx), POSTID(&oi));
+ }
+}
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
+{
+ lov_dump_lmm_common(level, lmm);
+ lov_dump_lmm_objects(level, lmm->lmm_objects,
+ le16_to_cpu(lmm->lmm_stripe_count));
+}
+
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+ lov_dump_lmm_common(level, lmm);
+ CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
+ lov_dump_lmm_objects(level, lmm->lmm_objects,
+ le16_to_cpu(lmm->lmm_stripe_count));
+}
+
+void lov_dump_lmm(int level, void *lmm)
+{
+ int magic;
+
+ magic = le32_to_cpu(((struct lov_mds_md *)lmm)->lmm_magic);
+ switch (magic) {
+ case LOV_MAGIC_V1:
+ lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)lmm);
+ break;
+ case LOV_MAGIC_V3:
+ lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)lmm);
+ break;
+ default:
+ CDEBUG(level, "unrecognized lmm_magic %x, assuming %x\n",
+ magic, LOV_MAGIC_V1);
+ lov_dump_lmm_common(level, lmm);
+ break;
+ }
+}
+
+/* Pack LOV object metadata for disk storage. It is packed in LE byte
+ * order and is opaque to the networking layer.
+ *
+ * XXX In the future, this will be enhanced to get the EA sizes from the
+ * underlying OSC device(s) so that we can stack LOVs properly. For now
+ * lov_mds_md_size() just assumes one u64 per stripe.
+ */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+ struct lov_stripe_md *lsm)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lov_obd *lov = &obd->u.lov;
+ struct lov_mds_md_v1 *lmmv1;
+ struct lov_mds_md_v3 *lmmv3;
+ __u16 stripe_count;
+ struct lov_ost_data_v1 *lmm_objects;
+ int lmm_size, lmm_magic;
+ int i;
+ int cplen = 0;
+
+ if (lsm) {
+ lmm_magic = lsm->lsm_magic;
+ } else {
+ if (lmmp && *lmmp)
+ lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+ else
+ /* lsm == NULL and lmmp == NULL */
+ lmm_magic = LOV_MAGIC;
+ }
+
+ if ((lmm_magic != LOV_MAGIC_V1) &&
+ (lmm_magic != LOV_MAGIC_V3)) {
+ CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+ lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+ return -EINVAL;
+ }
+
+ if (lsm) {
+ /* If we are just sizing the EA, limit the stripe count
+ * to the actual number of OSTs in this filesystem. */
+ if (!lmmp) {
+ stripe_count = lov_get_stripecnt(lov, lmm_magic,
+ lsm->lsm_stripe_count);
+ lsm->lsm_stripe_count = stripe_count;
+ } else if (!lsm_is_released(lsm)) {
+ stripe_count = lsm->lsm_stripe_count;
+ } else {
+ stripe_count = 0;
+ }
+ } else {
+ /* No need to allocate more than maximum supported stripes.
+ * Anyway, this is pretty inaccurate since ld_tgt_count now
+ * represents max index and we should rely on the actual number
+ * of OSTs instead */
+ stripe_count = lov_mds_md_max_stripe_count(
+ lov->lov_ocd.ocd_max_easize, lmm_magic);
+
+ if (stripe_count > lov->desc.ld_tgt_count)
+ stripe_count = lov->desc.ld_tgt_count;
+ }
+
+ /* XXX LOV STACKING call into osc for sizes */
+ lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+
+ if (!lmmp)
+ return lmm_size;
+
+ if (*lmmp && !lsm) {
+ stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count);
+ lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+ OBD_FREE_LARGE(*lmmp, lmm_size);
+ *lmmp = NULL;
+ return 0;
+ }
+
+ if (!*lmmp) {
+ OBD_ALLOC_LARGE(*lmmp, lmm_size);
+ if (!*lmmp)
+ return -ENOMEM;
+ }
+
+ CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n",
+ lmm_magic, lmm_size);
+
+ lmmv1 = *lmmp;
+ lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+ if (lmm_magic == LOV_MAGIC_V3)
+ lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+ else
+ lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
+
+ if (!lsm)
+ return lmm_size;
+
+ /* lmmv1 and lmmv3 point to the same struct and have the
+ * same first fields
+ */
+ lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi);
+ lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+ lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count);
+ lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+ lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen);
+ if (lsm->lsm_magic == LOV_MAGIC_V3) {
+ cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name,
+ sizeof(lmmv3->lmm_pool_name));
+ if (cplen >= sizeof(lmmv3->lmm_pool_name))
+ return -E2BIG;
+ lmm_objects = lmmv3->lmm_objects;
+ } else {
+ lmm_objects = lmmv1->lmm_objects;
+ }
+
+ for (i = 0; i < stripe_count; i++) {
+ struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+ /* XXX LOV STACKING call down to osc_packmd() to do packing */
+ LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID
+ " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi),
+ i, stripe_count, loi->loi_ost_idx);
+ ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi);
+ lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+ lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+ }
+
+ return lmm_size;
+}
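+
+/*
+ * Calling modes of lov_packmd(), summarized (a reading aid, not an
+ * additional interface): with lmmp == NULL it only returns the packed
+ * size; with *lmmp set and lsm == NULL it frees the previously packed
+ * buffer; otherwise it allocates *lmmp if necessary, writes the magic,
+ * and packs lsm (when one is given) into the buffer.
+ */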
+
+/* Find the max stripecount we should use */
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count)
+{
+ __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD;
+
+ if (!stripe_count)
+ stripe_count = lov->desc.ld_default_stripe_count;
+ if (stripe_count > lov->desc.ld_active_tgt_count)
+ stripe_count = lov->desc.ld_active_tgt_count;
+ if (!stripe_count)
+ stripe_count = 1;
+
+ /* stripe count is based on whether ldiskfs can handle
+ * larger EA sizes */
+ if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE &&
+ lov->lov_ocd.ocd_max_easize)
+ max_stripes = lov_mds_md_max_stripe_count(
+ lov->lov_ocd.ocd_max_easize, magic);
+
+ if (stripe_count > max_stripes)
+ stripe_count = max_stripes;
+
+ return stripe_count;
+}
+
+static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count)
+{
+ int rc;
+
+ if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
+ char *buffer;
+ int sz;
+
+ CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+ le32_to_cpu(*(__u32 *)lmm), lmm_bytes);
+ sz = lmm_bytes * 2 + 1;
+ OBD_ALLOC_LARGE(buffer, sz);
+ if (buffer != NULL) {
+ int i;
+
+ for (i = 0; i < lmm_bytes; i++)
+ sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]);
+ buffer[sz - 1] = '\0';
+ CERROR("%s\n", buffer);
+ OBD_FREE_LARGE(buffer, sz);
+ }
+ return -EINVAL;
+ }
+ rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
+ lmm_bytes, stripe_count);
+ return rc;
+}
+
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+ int pattern, int magic)
+{
+ int i, lsm_size;
+
+ CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count);
+
+ *lsmp = lsm_alloc_plain(stripe_count, &lsm_size);
+ if (!*lsmp) {
+ CERROR("can't allocate lsmp stripe_count %d\n", stripe_count);
+ return -ENOMEM;
+ }
+
+ atomic_set(&(*lsmp)->lsm_refc, 1);
+ spin_lock_init(&(*lsmp)->lsm_lock);
+ (*lsmp)->lsm_magic = magic;
+ (*lsmp)->lsm_stripe_count = stripe_count;
+ (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
+ (*lsmp)->lsm_pattern = pattern;
+ (*lsmp)->lsm_pool_name[0] = '\0';
+ (*lsmp)->lsm_layout_gen = 0;
+ if (stripe_count > 0)
+ (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0;
+
+ for (i = 0; i < stripe_count; i++)
+ loi_init((*lsmp)->lsm_oinfo[i]);
+
+ return lsm_size;
+}
+
+int lov_free_memmd(struct lov_stripe_md **lsmp)
+{
+ struct lov_stripe_md *lsm = *lsmp;
+ int refc;
+
+ *lsmp = NULL;
+ LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+ refc = atomic_dec_return(&lsm->lsm_refc);
+ if (refc == 0) {
+ LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+ lsm_op_find(lsm->lsm_magic)->lsm_free(lsm);
+ }
+ return refc;
+}
+
+/* Unpack LOV object metadata from disk storage. It is packed in LE byte
+ * order and is opaque to the networking layer.
+ */
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+ struct lov_mds_md *lmm, int lmm_bytes)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lov_obd *lov = &obd->u.lov;
+ int rc = 0, lsm_size;
+ __u16 stripe_count;
+ __u32 magic;
+ __u32 pattern;
+
+ /* If passed an MDS struct use values from there, otherwise defaults */
+ if (lmm) {
+ rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count);
+ if (rc)
+ return rc;
+ magic = le32_to_cpu(lmm->lmm_magic);
+ } else {
+ magic = LOV_MAGIC;
+ stripe_count = lov_get_stripecnt(lov, magic, 0);
+ }
+
+ /* If we aren't passed an lsmp struct, we just want the size */
+ if (!lsmp) {
+ /* XXX LOV STACKING call into osc for sizes */
+ LBUG();
+ return lov_stripe_md_size(stripe_count);
+ }
+ /* If we are passed an allocated struct but nothing to unpack, free */
+ if (*lsmp && !lmm) {
+ lov_free_memmd(lsmp);
+ return 0;
+ }
+
+ pattern = le32_to_cpu(lmm->lmm_pattern);
+ lsm_size = lov_alloc_memmd(lsmp, stripe_count, pattern, magic);
+ if (lsm_size < 0)
+ return lsm_size;
+
+ /* If we are passed a pointer but nothing to unpack, we only alloc */
+ if (!lmm)
+ return lsm_size;
+
+ LASSERT(lsm_op_find(magic) != NULL);
+ rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm);
+ if (rc) {
+ lov_free_memmd(lsmp);
+ return rc;
+ }
+
+ return lsm_size;
+}
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_ost_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_USER_MAGIC.
+ */
+int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
+ struct lov_user_md *lump)
+{
+ /*
+ * XXX huge struct allocated on stack.
+ */
+ /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+ struct lov_user_md_v3 lum;
+ struct lov_mds_md *lmmk = NULL;
+ int rc, lmm_size;
+ int lum_size;
+ mm_segment_t seg;
+
+ if (!lsm)
+ return -ENODATA;
+
+ /*
+ * "Switch to kernel segment" to allow copying from kernel space by
+ * copy_{to,from}_user().
+ */
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+
+ /* we only need the header part from user space to get lmm_magic and
+ * lmm_stripe_count (the header part is common to v1 and v3) */
+ lum_size = sizeof(struct lov_user_md_v1);
+ if (copy_from_user(&lum, lump, lum_size)) {
+ rc = -EFAULT;
+ goto out_set;
+ } else if ((lum.lmm_magic != LOV_USER_MAGIC) &&
+ (lum.lmm_magic != LOV_USER_MAGIC_V3)) {
+ rc = -EINVAL;
+ goto out_set;
+ }
+
+ if (lum.lmm_stripe_count &&
+ (lum.lmm_stripe_count < lsm->lsm_stripe_count)) {
+ /* Return the required stripe count to user space */
+ lum.lmm_stripe_count = lsm->lsm_stripe_count;
+ rc = copy_to_user(lump, &lum, lum_size);
+ rc = -EOVERFLOW;
+ goto out_set;
+ }
+ rc = lov_packmd(exp, &lmmk, lsm);
+ if (rc < 0)
+ goto out_set;
+ lmm_size = rc;
+ rc = 0;
+
+ /* FIXME: Bug 1185 - copy fields properly when structs change */
+ /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+ CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
+ CLASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lmmk->lmm_objects[0]));
+
+ if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+ ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
+ (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) {
+ lustre_swab_lov_mds_md(lmmk);
+ lustre_swab_lov_user_md_objects(
+ (struct lov_user_ost_data *)lmmk->lmm_objects,
+ lmmk->lmm_stripe_count);
+ }
+ if (lum.lmm_magic == LOV_USER_MAGIC) {
+ /* User requested v1, so we need to skip lmm_pool_name */
+ if (lmmk->lmm_magic == LOV_MAGIC_V3) {
+ memmove((char *)(&lmmk->lmm_stripe_count) +
+ sizeof(lmmk->lmm_stripe_count),
+ ((struct lov_mds_md_v3 *)lmmk)->lmm_objects,
+ lmmk->lmm_stripe_count *
+ sizeof(struct lov_ost_data_v1));
+ lmm_size -= LOV_MAXPOOLNAME;
+ }
+ } else {
+ /* if v3 we just have to update the lum_size */
+ lum_size = sizeof(struct lov_user_md_v3);
+ }
+
+ /* User wasn't expecting this many OST entries */
+ if (lum.lmm_stripe_count == 0)
+ lmm_size = lum_size;
+ else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) {
+ rc = -EOVERFLOW;
+ goto out_set;
+ }
+ /*
+ * lov_mds_md and lov_user_md differ, so we have to re-order the
+ * data before copying it to user space.
+ */
+ lum.lmm_stripe_count = lmmk->lmm_stripe_count;
+ lum.lmm_layout_gen = lmmk->lmm_layout_gen;
+ ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen;
+ ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count;
+ if (copy_to_user(lump, lmmk, lmm_size))
+ rc = -EFAULT;
+
+ obd_free_diskmd(exp, &lmmk);
+out_set:
+ set_fs(seg);
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c
new file mode 100644
index 000000000..c4596e8e5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_page.c
@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOV layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lov page operations.
+ *
+ */
+
+static int lov_page_invariant(const struct cl_page_slice *slice)
+{
+ const struct cl_page *page = slice->cpl_page;
+ const struct cl_page *sub = lov_sub_page(slice);
+
+ return ergo(sub != NULL,
+ page->cp_child == sub &&
+ sub->cp_parent == page &&
+ page->cp_state == sub->cp_state);
+}
+
+static void lov_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+ struct cl_page *sub = lov_sub_page(slice);
+
+ LINVRNT(lov_page_invariant(slice));
+
+ if (sub != NULL) {
+ LASSERT(sub->cp_state == CPS_FREEING);
+ lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent);
+ sub->cp_parent = NULL;
+ slice->cpl_page->cp_child = NULL;
+ cl_page_put(env, sub);
+ }
+}
+
+static int lov_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io,
+ int nonblock)
+{
+ struct lov_io *lio = lov_env_io(env);
+ struct lov_io_sub *sub;
+
+ LINVRNT(lov_page_invariant(slice));
+ LINVRNT(!cl2lov_page(slice)->lps_invalid);
+
+ sub = lov_page_subio(env, lio, slice);
+ if (!IS_ERR(sub)) {
+ lov_sub_page(slice)->cp_owner = sub->sub_io;
+ lov_sub_put(sub);
+ } else
+ LBUG(); /* Arrgh */
+ return 0;
+}
+
+static void lov_page_assume(const struct lu_env *env,
+ const struct cl_page_slice *slice, struct cl_io *io)
+{
+ lov_page_own(env, slice, io, 0);
+}
+
+static int lov_page_cache_add(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io)
+{
+ struct lov_io *lio = lov_env_io(env);
+ struct lov_io_sub *sub;
+ int rc = 0;
+
+ LINVRNT(lov_page_invariant(slice));
+ LINVRNT(!cl2lov_page(slice)->lps_invalid);
+
+ sub = lov_page_subio(env, lio, slice);
+ if (!IS_ERR(sub)) {
+ rc = cl_page_cache_add(sub->sub_env, sub->sub_io,
+ slice->cpl_page->cp_child, CRT_WRITE);
+ lov_sub_put(sub);
+ } else {
+ rc = PTR_ERR(sub);
+ CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc);
+ }
+ return rc;
+}
+
+static int lov_page_print(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t printer)
+{
+ struct lov_page *lp = cl2lov_page(slice);
+
+ return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp);
+}
+
+static const struct cl_page_operations lov_page_ops = {
+ .cpo_fini = lov_page_fini,
+ .cpo_own = lov_page_own,
+ .cpo_assume = lov_page_assume,
+ .io = {
+ [CRT_WRITE] = {
+ .cpo_cache_add = lov_page_cache_add
+ }
+ },
+ .cpo_print = lov_page_print
+};
+
+static void lov_empty_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+ LASSERT(slice->cpl_page->cp_child == NULL);
+}
+
+int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage)
+{
+ struct lov_object *loo = cl2lov(obj);
+ struct lov_layout_raid0 *r0 = lov_r0(loo);
+ struct lov_io *lio = lov_env_io(env);
+ struct cl_page *subpage;
+ struct cl_object *subobj;
+ struct lov_io_sub *sub;
+ struct lov_page *lpg = cl_object_page_slice(obj, page);
+ loff_t offset;
+ u64 suboff;
+ int stripe;
+ int rc;
+
+ offset = cl_offset(obj, page->cp_index);
+ stripe = lov_stripe_number(loo->lo_lsm, offset);
+ LASSERT(stripe < r0->lo_nr);
+ rc = lov_stripe_offset(loo->lo_lsm, offset, stripe,
+ &suboff);
+ LASSERT(rc == 0);
+
+ lpg->lps_invalid = 1;
+ cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops);
+
+ sub = lov_sub_get(env, lio, stripe);
+ if (IS_ERR(sub)) {
+ rc = PTR_ERR(sub);
+ goto out;
+ }
+
+ subobj = lovsub2cl(r0->lo_sub[stripe]);
+ subpage = cl_page_find_sub(sub->sub_env, subobj,
+ cl_index(subobj, suboff), vmpage, page);
+ lov_sub_put(sub);
+ if (IS_ERR(subpage)) {
+ rc = PTR_ERR(subpage);
+ goto out;
+ }
+
+ if (likely(subpage->cp_parent == page)) {
+ lu_ref_add(&subpage->cp_reference, "lov", page);
+ lpg->lps_invalid = 0;
+ rc = 0;
+ } else {
+ CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n");
+ CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n");
+ LASSERT(0);
+ }
+
+out:
+ return rc;
+}
+
+static const struct cl_page_operations lov_empty_page_ops = {
+ .cpo_fini = lov_empty_page_fini,
+ .cpo_print = lov_page_print
+};
+
+int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage)
+{
+ struct lov_page *lpg = cl_object_page_slice(obj, page);
+ void *addr;
+
+ cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops);
+ addr = kmap(vmpage);
+ memset(addr, 0, cl_page_size(obj));
+ kunmap(vmpage);
+ cl_page_export(env, page, 1);
+ return 0;
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c
new file mode 100644
index 000000000..d96163de7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_pool.c
@@ -0,0 +1,673 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
+ * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd.h"
+#include "lov_internal.h"
+
+#define pool_tgt(_p, _i) \
+ _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]]
+
+static void lov_pool_getref(struct pool_desc *pool)
+{
+ CDEBUG(D_INFO, "pool %p\n", pool);
+ atomic_inc(&pool->pool_refcount);
+}
+
+void lov_pool_putref(struct pool_desc *pool)
+{
+ CDEBUG(D_INFO, "pool %p\n", pool);
+ if (atomic_dec_and_test(&pool->pool_refcount)) {
+ LASSERT(hlist_unhashed(&pool->pool_hash));
+ LASSERT(list_empty(&pool->pool_list));
+ LASSERT(pool->pool_proc_entry == NULL);
+ lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+ lov_ost_pool_free(&(pool->pool_obds));
+ OBD_FREE_PTR(pool);
+ }
+}
+
+static void lov_pool_putref_locked(struct pool_desc *pool)
+{
+ CDEBUG(D_INFO, "pool %p\n", pool);
+ LASSERT(atomic_read(&pool->pool_refcount) > 1);
+
+ atomic_dec(&pool->pool_refcount);
+}
+
+/*
+ * hash function using a Rotating Hash algorithm
+ * Knuth, D. The Art of Computer Programming,
+ * Volume 3: Sorting and Searching,
+ * Chapter 6.4.
+ * Addison Wesley, 1973
+ */
+static __u32 pool_hashfn(struct cfs_hash *hash_body, const void *key, unsigned mask)
+{
+ int i;
+ __u32 result;
+ char *poolname;
+
+ result = 0;
+ poolname = (char *)key;
+ for (i = 0; i < LOV_MAXPOOLNAME; i++) {
+ if (poolname[i] == '\0')
+ break;
+ result = (result << 4)^(result >> 28) ^ poolname[i];
+ }
+ return (result % mask);
+}
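+
+/*
+ * Illustrative trace (not exercised by the code itself): hashing the key
+ * "abc" starts from result = 0 and accumulates
+ *
+ *	'a' (0x61): 0x00000061
+ *	'b' (0x62): 0x00000672
+ *	'c' (0x63): 0x00006743
+ *
+ * before the final reduction modulo mask; the (result >> 28) term only
+ * contributes once enough characters have shifted bits into the top nibble.
+ */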
+
+static void *pool_key(struct hlist_node *hnode)
+{
+ struct pool_desc *pool;
+
+ pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+ return pool->pool_name;
+}
+
+static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode)
+{
+ char *pool_name;
+ struct pool_desc *pool;
+
+ pool_name = (char *)key;
+ pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+ return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME);
+}
+
+static void *pool_hashobject(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct pool_desc, pool_hash);
+}
+
+static void pool_hashrefcount_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct pool_desc *pool;
+
+ pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+ lov_pool_getref(pool);
+}
+
+static void pool_hashrefcount_put_locked(struct cfs_hash *hs,
+ struct hlist_node *hnode)
+{
+ struct pool_desc *pool;
+
+ pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+ lov_pool_putref_locked(pool);
+}
+
+cfs_hash_ops_t pool_hash_operations = {
+ .hs_hash = pool_hashfn,
+ .hs_key = pool_key,
+ .hs_keycmp = pool_hashkey_keycmp,
+ .hs_object = pool_hashobject,
+ .hs_get = pool_hashrefcount_get,
+ .hs_put_locked = pool_hashrefcount_put_locked,
+};
+
+#if defined (CONFIG_PROC_FS)
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * iterator is used to go through the target pool entries
+ * index is the current entry index in the lp_array[] array
+ * index >= pos returned to the seq_file interface
+ * pos is from 0 to (pool->pool_obds.op_count - 1)
+ */
+#define POOL_IT_MAGIC 0xB001CEA0
+struct pool_iterator {
+ int magic;
+ struct pool_desc *pool;
+ int idx; /* from 0 to pool_tgt_size - 1 */
+};
+
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct pool_iterator *iter = (struct pool_iterator *)s->private;
+ int prev_idx;
+
+ LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+
+ /* test if end of file */
+ if (*pos >= pool_tgt_count(iter->pool))
+ return NULL;
+
+ /* iterate to find a non-empty entry */
+ prev_idx = iter->idx;
+ down_read(&pool_tgt_rw_sem(iter->pool));
+ iter->idx++;
+ if (iter->idx == pool_tgt_count(iter->pool)) {
+ iter->idx = prev_idx; /* we stay on the last entry */
+ up_read(&pool_tgt_rw_sem(iter->pool));
+ return NULL;
+ }
+ up_read(&pool_tgt_rw_sem(iter->pool));
+ (*pos)++;
+ /* return != NULL to continue */
+ return iter;
+}
+
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+ struct pool_desc *pool = (struct pool_desc *)s->private;
+ struct pool_iterator *iter;
+
+ lov_pool_getref(pool);
+ if ((pool_tgt_count(pool) == 0) ||
+ (*pos >= pool_tgt_count(pool))) {
+ /* the iterator is not created, so stop() has no way to
+ * find the pool to drop its reference */
+ lov_pool_putref(pool);
+ return NULL;
+ }
+
+ OBD_ALLOC_PTR(iter);
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+ iter->magic = POOL_IT_MAGIC;
+ iter->pool = pool;
+ iter->idx = 0;
+
+ /* we use the seq_file private field to remember the iterator so
+ * we can free it at stop() */
+ /* /!\ do not forget to restore it to the pool before freeing it */
+ s->private = iter;
+ if (*pos > 0) {
+ loff_t i;
+ void *ptr;
+
+ i = 0;
+ do {
+ ptr = pool_proc_next(s, &iter, &i);
+ } while ((i < *pos) && (ptr != NULL));
+ return ptr;
+ }
+ return iter;
+}
+
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+ struct pool_iterator *iter = (struct pool_iterator *)s->private;
+
+ /* in some cases the stop() method is called twice, without
+ * calling the start() method (see seq_read() in fs/seq_file.c),
+ * so we have to free only if s->private is an iterator */
+ if ((iter) && (iter->magic == POOL_IT_MAGIC)) {
+ /* we restore s->private so next call to pool_proc_start()
+ * will work */
+ s->private = iter->pool;
+ lov_pool_putref(iter->pool);
+ OBD_FREE_PTR(iter);
+ }
+ return;
+}
+
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+ struct pool_iterator *iter = (struct pool_iterator *)v;
+ struct lov_tgt_desc *tgt;
+
+ LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+ LASSERT(iter->pool != NULL);
+ LASSERT(iter->idx <= pool_tgt_count(iter->pool));
+
+ down_read(&pool_tgt_rw_sem(iter->pool));
+ tgt = pool_tgt(iter->pool, iter->idx);
+ up_read(&pool_tgt_rw_sem(iter->pool));
+ if (tgt)
+ seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid)));
+
+ return 0;
+}
+
+static struct seq_operations pool_proc_ops = {
+ .start = pool_proc_start,
+ .next = pool_proc_next,
+ .stop = pool_proc_stop,
+ .show = pool_proc_show,
+};
+
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+ int rc;
+
+ rc = seq_open(file, &pool_proc_ops);
+ if (!rc) {
+ struct seq_file *s = file->private_data;
+ s->private = PDE_DATA(inode);
+ }
+ return rc;
+}
+
+static struct file_operations pool_proc_operations = {
+ .open = pool_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+ int i;
+
+ lov_pool_getref(pool);
+
+ CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n",
+ pool->pool_name, pool->pool_obds.op_count);
+ down_read(&pool_tgt_rw_sem(pool));
+
+ for (i = 0; i < pool_tgt_count(pool) ; i++) {
+ if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp)
+ continue;
+ CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n",
+ pool->pool_name, i,
+ obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid)));
+ }
+
+ up_read(&pool_tgt_rw_sem(pool));
+ lov_pool_putref(pool);
+}
+
+#define LOV_POOL_INIT_COUNT 2
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+ if (count == 0)
+ count = LOV_POOL_INIT_COUNT;
+ op->op_array = NULL;
+ op->op_count = 0;
+ init_rwsem(&op->op_rw_sem);
+ op->op_size = count;
+ OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+ if (op->op_array == NULL) {
+ op->op_size = 0;
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/* Caller must hold write op_rwlock */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count)
+{
+ __u32 *new;
+ int new_size;
+
+ LASSERT(min_count != 0);
+
+ if (op->op_count < op->op_size)
+ return 0;
+
+ new_size = max(min_count, 2 * op->op_size);
+ OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+ if (new == NULL)
+ return -ENOMEM;
+
+ /* copy old array to new one */
+ memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+ OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+ op->op_array = new;
+ op->op_size = new_size;
+ return 0;
+}
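+
+/*
+ * Growth policy note (descriptive only): starting from the initial
+ * LOV_POOL_INIT_COUNT entries (the default), the array doubles on each
+ * extension (2 -> 4 -> 8 -> ...), unless min_count asks for more than
+ * twice the current size in one step.
+ */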
+
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count)
+{
+ int rc = 0, i;
+
+ down_write(&op->op_rw_sem);
+
+ rc = lov_ost_pool_extend(op, min_count);
+ if (rc)
+ goto out;
+
+ /* search ost in pool array */
+ for (i = 0; i < op->op_count; i++) {
+ if (op->op_array[i] == idx) {
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+ /* OST not found, so add it */
+ op->op_array[op->op_count] = idx;
+ op->op_count++;
+out:
+ up_write(&op->op_rw_sem);
+ return rc;
+}
+
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+ int i;
+
+ down_write(&op->op_rw_sem);
+
+ for (i = 0; i < op->op_count; i++) {
+ if (op->op_array[i] == idx) {
+ memmove(&op->op_array[i], &op->op_array[i + 1],
+ (op->op_count - i - 1) * sizeof(op->op_array[0]));
+ op->op_count--;
+ up_write(&op->op_rw_sem);
+ return 0;
+ }
+ }
+
+ up_write(&op->op_rw_sem);
+ return -EINVAL;
+}
+
+int lov_ost_pool_free(struct ost_pool *op)
+{
+ if (op->op_size == 0)
+ return 0;
+
+ down_write(&op->op_rw_sem);
+
+ OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+ op->op_array = NULL;
+ op->op_count = 0;
+ op->op_size = 0;
+
+ up_write(&op->op_rw_sem);
+ return 0;
+}
+
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+ struct lov_obd *lov;
+ struct pool_desc *new_pool;
+ int rc;
+
+ lov = &(obd->u.lov);
+
+ if (strlen(poolname) > LOV_MAXPOOLNAME)
+ return -ENAMETOOLONG;
+
+ OBD_ALLOC_PTR(new_pool);
+ if (new_pool == NULL)
+ return -ENOMEM;
+
+ strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME);
+ new_pool->pool_name[LOV_MAXPOOLNAME] = '\0';
+ new_pool->pool_lobd = obd;
+ /* refcount is initialized to 1 because a pool is always in use
+ * from creation until deletion
+ */
+ atomic_set(&new_pool->pool_refcount, 1);
+ rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+ if (rc)
+ goto out_err;
+
+ memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+ rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+ if (rc)
+ goto out_free_pool_obds;
+
+ INIT_HLIST_NODE(&new_pool->pool_hash);
+
+#if defined (CONFIG_PROC_FS)
+ /* we need this because seq_file is not implemented for liblustre */
+ /* get ref for /proc file */
+ lov_pool_getref(new_pool);
+ new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
+ poolname, new_pool,
+ &pool_proc_operations);
+ if (IS_ERR(new_pool->pool_proc_entry)) {
+ CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
+ new_pool->pool_proc_entry = NULL;
+ lov_pool_putref(new_pool);
+ }
+ CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry);
+#endif
+
+ spin_lock(&obd->obd_dev_lock);
+ list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+ lov->lov_pool_count++;
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* add it to the hash, and hence make it findable, only when it is fully ready */
+ rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname,
+ &new_pool->pool_hash);
+ if (rc) {
+ rc = -EEXIST;
+ goto out_err;
+ }
+
+ CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+ poolname, lov->lov_pool_count);
+
+ return 0;
+
+out_err:
+ spin_lock(&obd->obd_dev_lock);
+ list_del_init(&new_pool->pool_list);
+ lov->lov_pool_count--;
+ spin_unlock(&obd->obd_dev_lock);
+
+ lprocfs_remove(&new_pool->pool_proc_entry);
+
+ lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_pool_obds:
+ lov_ost_pool_free(&new_pool->pool_obds);
+ OBD_FREE_PTR(new_pool);
+ return rc;
+}
+
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+ struct lov_obd *lov;
+ struct pool_desc *pool;
+
+ lov = &(obd->u.lov);
+
+ /* lookup and kill hash reference */
+ pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname);
+ if (pool == NULL)
+ return -ENOENT;
+
+ if (pool->pool_proc_entry != NULL) {
+ CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
+ lprocfs_remove(&pool->pool_proc_entry);
+ lov_pool_putref(pool);
+ }
+
+ spin_lock(&obd->obd_dev_lock);
+ list_del_init(&pool->pool_list);
+ lov->lov_pool_count--;
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* release last reference */
+ lov_pool_putref(pool);
+
+ return 0;
+}
+
+
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+ struct obd_uuid ost_uuid;
+ struct lov_obd *lov;
+ struct pool_desc *pool;
+ unsigned int lov_idx;
+ int rc;
+
+ lov = &(obd->u.lov);
+
+ pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+ if (pool == NULL)
+ return -ENOENT;
+
+ obd_str2uuid(&ost_uuid, ostname);
+
+
+ /* search ost in lov array */
+ obd_getref(obd);
+ for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+ if (!lov->lov_tgts[lov_idx])
+ continue;
+ if (obd_uuid_equals(&ost_uuid,
+ &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+ break;
+ }
+ /* check whether the OST was found in the lov */
+ if (lov_idx == lov->desc.ld_tgt_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+ if (rc)
+ goto out;
+
+ pool->pool_rr.lqr_dirty = 1;
+
+ CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
+ ostname, poolname, pool_tgt_count(pool));
+
+out:
+ obd_putref(obd);
+ lov_pool_putref(pool);
+ return rc;
+}
+
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+ struct obd_uuid ost_uuid;
+ struct lov_obd *lov;
+ struct pool_desc *pool;
+ unsigned int lov_idx;
+ int rc = 0;
+
+ lov = &(obd->u.lov);
+
+ pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+ if (pool == NULL)
+ return -ENOENT;
+
+ obd_str2uuid(&ost_uuid, ostname);
+
+ obd_getref(obd);
+ /* search ost in lov array, to get index */
+ for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+ if (!lov->lov_tgts[lov_idx])
+ continue;
+
+ if (obd_uuid_equals(&ost_uuid,
+ &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+ break;
+ }
+
+ /* check whether the OST was found in the lov */
+ if (lov_idx == lov->desc.ld_tgt_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+ pool->pool_rr.lqr_dirty = 1;
+
+ CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
+ poolname);
+
+out:
+ obd_putref(obd);
+ lov_pool_putref(pool);
+ return rc;
+}
+
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+ int i, rc;
+
+ /* the caller may not have a ref on the pool if it got the pool
+ * without calling lov_find_pool() (e.g. by walking the lov pool
+ * list)
+ */
+ lov_pool_getref(pool);
+
+ down_read(&pool_tgt_rw_sem(pool));
+
+ for (i = 0; i < pool_tgt_count(pool); i++) {
+ if (pool_tgt_array(pool)[i] == idx) {
+ rc = 0;
+ goto out;
+ }
+ }
+ rc = -ENOENT;
+out:
+ up_read(&pool_tgt_rw_sem(pool));
+
+ lov_pool_putref(pool);
+ return rc;
+}
+
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+ struct pool_desc *pool;
+
+ pool = NULL;
+ if (poolname[0] != '\0') {
+ pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+ if (pool == NULL)
+ CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n",
+ poolname);
+ if ((pool != NULL) && (pool_tgt_count(pool) == 0)) {
+ CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
+ poolname);
+ /* pool is ignored, so we remove ref on it */
+ lov_pool_putref(pool);
+ pool = NULL;
+ }
+ }
+ return pool;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c
new file mode 100644
index 000000000..933e2d1f8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lov_request.c
@@ -0,0 +1,773 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_class.h"
+#include "../include/lustre/lustre_idl.h"
+#include "lov_internal.h"
+
+static void lov_init_set(struct lov_request_set *set)
+{
+ set->set_count = 0;
+ atomic_set(&set->set_completes, 0);
+ atomic_set(&set->set_success, 0);
+ atomic_set(&set->set_finish_checked, 0);
+ set->set_cookies = NULL;
+ INIT_LIST_HEAD(&set->set_list);
+ atomic_set(&set->set_refcount, 1);
+ init_waitqueue_head(&set->set_waitq);
+ spin_lock_init(&set->set_lock);
+}
+
+void lov_finish_set(struct lov_request_set *set)
+{
+ struct list_head *pos, *n;
+
+ LASSERT(set);
+ list_for_each_safe(pos, n, &set->set_list) {
+ struct lov_request *req = list_entry(pos,
+ struct lov_request,
+ rq_link);
+ list_del_init(&req->rq_link);
+
+ if (req->rq_oi.oi_oa)
+ OBDO_FREE(req->rq_oi.oi_oa);
+ if (req->rq_oi.oi_md)
+ OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+ if (req->rq_oi.oi_osfs)
+ OBD_FREE(req->rq_oi.oi_osfs,
+ sizeof(*req->rq_oi.oi_osfs));
+ OBD_FREE(req, sizeof(*req));
+ }
+
+ if (set->set_pga) {
+ int len = set->set_oabufs * sizeof(*set->set_pga);
+ OBD_FREE_LARGE(set->set_pga, len);
+ }
+ if (set->set_lockh)
+ lov_llh_put(set->set_lockh);
+
+ OBD_FREE(set, sizeof(*set));
+}
+
+int lov_set_finished(struct lov_request_set *set, int idempotent)
+{
+ int completes = atomic_read(&set->set_completes);
+
+ CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count);
+
+ if (completes == set->set_count) {
+ if (idempotent)
+ return 1;
+ if (atomic_inc_return(&set->set_finish_checked) == 1)
+ return 1;
+ }
+ return 0;
+}
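lov_set_finished() lets exactly one non-idempotent caller act on completion: once set_completes reaches set_count, only the caller that bumps set_finish_checked from 0 to 1 gets a non-zero return. A small userspace sketch of that claim-once pattern, using C11 atomics in place of the kernel's atomic_t (the names here are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

struct demo_set {
        int        count;               /* requests issued */
        atomic_int completes;           /* requests completed so far */
        atomic_int finish_checked;      /* callers that saw "finished" */
};

/* Returns true for at most one caller once every request has completed. */
static bool demo_set_finished_once(struct demo_set *set)
{
        if (atomic_load(&set->completes) != set->count)
                return false;
        /* the caller that moves the counter from 0 to 1 claims finalization */
        return atomic_fetch_add(&set->finish_checked, 1) == 0;
}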
+
+void lov_update_set(struct lov_request_set *set,
+ struct lov_request *req, int rc)
+{
+ req->rq_complete = 1;
+ req->rq_rc = rc;
+
+ atomic_inc(&set->set_completes);
+ if (rc == 0)
+ atomic_inc(&set->set_success);
+
+ wake_up(&set->set_waitq);
+}
+
+int lov_update_common_set(struct lov_request_set *set,
+ struct lov_request *req, int rc)
+{
+ struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+
+ lov_update_set(set, req, rc);
+
+ /* forgive the error on an inactive OST */
+ if (rc && !(lov->lov_tgts[req->rq_idx] &&
+ lov->lov_tgts[req->rq_idx]->ltd_active))
+ rc = 0;
+
+ /* FIXME in raid1 regime, should return 0 */
+ return rc;
+}
+
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
+{
+ list_add_tail(&req->rq_link, &set->set_list);
+ set->set_count++;
+ req->rq_rqset = set;
+}
+
+static int lov_check_set(struct lov_obd *lov, int idx)
+{
+ int rc;
+ struct lov_tgt_desc *tgt;
+
+ mutex_lock(&lov->lov_lock);
+ tgt = lov->lov_tgts[idx];
+ rc = !tgt || tgt->ltd_active ||
+ (tgt->ltd_exp &&
+ class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried);
+ mutex_unlock(&lov->lov_lock);
+
+ return rc;
+}
+
+/* Check if the OSC connection exists and is active.
+ * If the OSC has not yet had a chance to connect to the OST the first time,
+ * wait once for it to connect instead of returning an error.
+ */
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
+{
+ wait_queue_head_t waitq;
+ struct l_wait_info lwi;
+ struct lov_tgt_desc *tgt;
+ int rc = 0;
+
+ mutex_lock(&lov->lov_lock);
+
+ tgt = lov->lov_tgts[ost_idx];
+
+ if (unlikely(tgt == NULL)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (likely(tgt->ltd_active)) {
+ rc = 1;
+ goto out;
+ }
+
+ if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) {
+ rc = 0;
+ goto out;
+ }
+
+ mutex_unlock(&lov->lov_lock);
+
+ init_waitqueue_head(&waitq);
+ lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout),
+ cfs_time_seconds(1), NULL, NULL);
+
+ rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi);
+ if (tgt != NULL && tgt->ltd_active)
+ return 1;
+
+ return 0;
+
+out:
+ mutex_unlock(&lov->lov_lock);
+ return rc;
+}
+
+static int common_attr_done(struct lov_request_set *set)
+{
+ struct list_head *pos;
+ struct lov_request *req;
+ struct obdo *tmp_oa;
+ int rc = 0, attrset = 0;
+
+ LASSERT(set->set_oi != NULL);
+
+ if (set->set_oi->oi_oa == NULL)
+ return 0;
+
+ if (!atomic_read(&set->set_success))
+ return -EIO;
+
+ OBDO_ALLOC(tmp_oa);
+ if (tmp_oa == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ list_for_each(pos, &set->set_list) {
+ req = list_entry(pos, struct lov_request, rq_link);
+
+ if (!req->rq_complete || req->rq_rc)
+ continue;
+ if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */
+ continue;
+ lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa,
+ req->rq_oi.oi_oa->o_valid,
+ set->set_oi->oi_md, req->rq_stripe, &attrset);
+ }
+ if (!attrset) {
+ CERROR("No stripes had valid attrs\n");
+ rc = -EIO;
+ }
+ if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) &&
+ (set->set_oi->oi_md->lsm_stripe_count != attrset)) {
+ /* When we take attributes of some epoch, we require all the
+ * OSTs to be active. */
+ CERROR("Not all the stripes had valid attrs\n");
+ rc = -EIO;
+ goto out;
+ }
+
+ tmp_oa->o_oi = set->set_oi->oi_oa->o_oi;
+ memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa));
+out:
+ if (tmp_oa)
+ OBDO_FREE(tmp_oa);
+ return rc;
+
+}
+
+int lov_fini_getattr_set(struct lov_request_set *set)
+{
+ int rc = 0;
+
+ if (set == NULL)
+ return 0;
+ LASSERT(set->set_exp);
+ if (atomic_read(&set->set_completes))
+ rc = common_attr_done(set);
+
+ lov_put_reqset(set);
+
+ return rc;
+}
+
+/* The callback for osc_getattr_async that finalizes a request info when a
+ * response is received. */
+static int cb_getattr_update(void *cookie, int rc)
+{
+ struct obd_info *oinfo = cookie;
+ struct lov_request *lovreq;
+
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct lov_request_set **reqset)
+{
+ struct lov_request_set *set;
+ struct lov_obd *lov = &exp->exp_obd->u.lov;
+ int rc = 0, i;
+
+ OBD_ALLOC(set, sizeof(*set));
+ if (set == NULL)
+ return -ENOMEM;
+ lov_init_set(set);
+
+ set->set_exp = exp;
+ set->set_oi = oinfo;
+
+ for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+ struct lov_oinfo *loi;
+ struct lov_request *req;
+
+ loi = oinfo->oi_md->lsm_oinfo[i];
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+ CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH) {
+ /* SOM requires all the OSTs to be active. */
+ rc = -EIO;
+ goto out_set;
+ }
+ continue;
+ }
+
+ OBD_ALLOC(req, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out_set;
+ }
+
+ req->rq_stripe = i;
+ req->rq_idx = loi->loi_ost_idx;
+
+ OBDO_ALLOC(req->rq_oi.oi_oa);
+ if (req->rq_oi.oi_oa == NULL) {
+ OBD_FREE(req, sizeof(*req));
+ rc = -ENOMEM;
+ goto out_set;
+ }
+ memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+ sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+ req->rq_oi.oi_cb_up = cb_getattr_update;
+ req->rq_oi.oi_capa = oinfo->oi_capa;
+
+ lov_set_add_req(req, set);
+ }
+ if (!set->set_count) {
+ rc = -EIO;
+ goto out_set;
+ }
+ *reqset = set;
+ return rc;
+out_set:
+ lov_fini_getattr_set(set);
+ return rc;
+}
+
+int lov_fini_destroy_set(struct lov_request_set *set)
+{
+ if (set == NULL)
+ return 0;
+ LASSERT(set->set_exp);
+ if (atomic_read(&set->set_completes)) {
+ /* FIXME update qos data here */
+ }
+
+ lov_put_reqset(set);
+
+ return 0;
+}
+
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obdo *src_oa, struct lov_stripe_md *lsm,
+ struct obd_trans_info *oti,
+ struct lov_request_set **reqset)
+{
+ struct lov_request_set *set;
+ struct lov_obd *lov = &exp->exp_obd->u.lov;
+ int rc = 0, i;
+
+ OBD_ALLOC(set, sizeof(*set));
+ if (set == NULL)
+ return -ENOMEM;
+ lov_init_set(set);
+
+ set->set_exp = exp;
+ set->set_oi = oinfo;
+ set->set_oi->oi_md = lsm;
+ set->set_oi->oi_oa = src_oa;
+ set->set_oti = oti;
+ if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE)
+ set->set_cookies = oti->oti_logcookies;
+
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_oinfo *loi;
+ struct lov_request *req;
+
+ loi = lsm->lsm_oinfo[i];
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+ CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+ continue;
+ }
+
+ OBD_ALLOC(req, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out_set;
+ }
+
+ req->rq_stripe = i;
+ req->rq_idx = loi->loi_ost_idx;
+
+ OBDO_ALLOC(req->rq_oi.oi_oa);
+ if (req->rq_oi.oi_oa == NULL) {
+ OBD_FREE(req, sizeof(*req));
+ rc = -ENOMEM;
+ goto out_set;
+ }
+ memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+ lov_set_add_req(req, set);
+ }
+ if (!set->set_count) {
+ rc = -EIO;
+ goto out_set;
+ }
+ *reqset = set;
+ return rc;
+out_set:
+ lov_fini_destroy_set(set);
+ return rc;
+}
+
+int lov_fini_setattr_set(struct lov_request_set *set)
+{
+ int rc = 0;
+
+ if (set == NULL)
+ return 0;
+ LASSERT(set->set_exp);
+ if (atomic_read(&set->set_completes)) {
+ rc = common_attr_done(set);
+ /* FIXME update qos data here */
+ }
+
+ lov_put_reqset(set);
+ return rc;
+}
+
+int lov_update_setattr_set(struct lov_request_set *set,
+ struct lov_request *req, int rc)
+{
+ struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+ struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+
+ lov_update_set(set, req, rc);
+
+ /* forgive the error on an inactive OST */
+ if (rc && !(lov->lov_tgts[req->rq_idx] &&
+ lov->lov_tgts[req->rq_idx]->ltd_active))
+ rc = 0;
+
+ if (rc == 0) {
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
+ lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime =
+ req->rq_oi.oi_oa->o_ctime;
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
+ lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime =
+ req->rq_oi.oi_oa->o_mtime;
+ if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
+ lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime =
+ req->rq_oi.oi_oa->o_atime;
+ }
+
+ return rc;
+}
+
+/* The callback for osc_setattr_async that finalizes a request info when a
+ * response is received. */
+static int cb_setattr_update(void *cookie, int rc)
+{
+ struct obd_info *oinfo = cookie;
+ struct lov_request *lovreq;
+
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct lov_request_set **reqset)
+{
+ struct lov_request_set *set;
+ struct lov_obd *lov = &exp->exp_obd->u.lov;
+ int rc = 0, i;
+
+ OBD_ALLOC(set, sizeof(*set));
+ if (set == NULL)
+ return -ENOMEM;
+ lov_init_set(set);
+
+ set->set_exp = exp;
+ set->set_oti = oti;
+ set->set_oi = oinfo;
+ if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+ set->set_cookies = oti->oti_logcookies;
+
+ for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+ struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+ struct lov_request *req;
+
+ if (lov_oinfo_is_dummy(loi))
+ continue;
+
+ if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+ CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+ continue;
+ }
+
+ OBD_ALLOC(req, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out_set;
+ }
+ req->rq_stripe = i;
+ req->rq_idx = loi->loi_ost_idx;
+
+ OBDO_ALLOC(req->rq_oi.oi_oa);
+ if (req->rq_oi.oi_oa == NULL) {
+ OBD_FREE(req, sizeof(*req));
+ rc = -ENOMEM;
+ goto out_set;
+ }
+ memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+ sizeof(*req->rq_oi.oi_oa));
+ req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+ req->rq_oi.oi_oa->o_stripe_idx = i;
+ req->rq_oi.oi_cb_up = cb_setattr_update;
+ req->rq_oi.oi_capa = oinfo->oi_capa;
+
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
+ int off = lov_stripe_offset(oinfo->oi_md,
+ oinfo->oi_oa->o_size, i,
+ &req->rq_oi.oi_oa->o_size);
+
+ if (off < 0 && req->rq_oi.oi_oa->o_size)
+ req->rq_oi.oi_oa->o_size--;
+
+ CDEBUG(D_INODE, "stripe %d has size %llu/%llu\n",
+ i, req->rq_oi.oi_oa->o_size,
+ oinfo->oi_oa->o_size);
+ }
+ lov_set_add_req(req, set);
+ }
+ if (!set->set_count) {
+ rc = -EIO;
+ goto out_set;
+ }
+ *reqset = set;
+ return rc;
+out_set:
+ lov_fini_setattr_set(set);
+ return rc;
+}
+
+#define LOV_U64_MAX ((__u64)~0ULL)
+#define LOV_SUM_MAX(tot, add) \
+ do { \
+ if ((tot) + (add) < (tot)) \
+ (tot) = LOV_U64_MAX; \
+ else \
+ (tot) += (add); \
+ } while (0)
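LOV_SUM_MAX() is a saturating 64-bit add: if the sum would wrap, the running total is pinned at LOV_U64_MAX so aggregated statfs counters never appear to shrink. The same check, restated as a compilable helper (the function name is illustrative, not the patch's code):

#include <stdint.h>

/* Saturating 64-bit add: on wraparound, clamp the result to UINT64_MAX. */
static inline uint64_t sum_saturate_u64(uint64_t tot, uint64_t add)
{
        return (tot + add < tot) ? UINT64_MAX : tot + add;
}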
+
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+ int success)
+{
+ if (success) {
+ __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov,
+ LOV_MAGIC, 0);
+ if (osfs->os_files != LOV_U64_MAX)
+ lov_do_div64(osfs->os_files, expected_stripes);
+ if (osfs->os_ffree != LOV_U64_MAX)
+ lov_do_div64(osfs->os_ffree, expected_stripes);
+
+ spin_lock(&obd->obd_osfs_lock);
+ memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+ obd->obd_osfs_age = cfs_time_current_64();
+ spin_unlock(&obd->obd_osfs_lock);
+ return 0;
+ }
+
+ return -EIO;
+}
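lov_fini_statfs() divides the aggregated object counts by the expected stripe count, since each file is expected to consume one object per stripe. A compact sketch of that adjustment, assuming LOV_U64_MAX marks an unknown value as in the code above (files_estimate is an illustrative name):

#include <stdint.h>

/* Scale an aggregate OST object count down to an estimated file count,
 * leaving the "unknown" sentinel untouched. */
static uint64_t files_estimate(uint64_t total_objects,
                               uint32_t expected_stripes)
{
        if (total_objects == UINT64_MAX)        /* value unknown */
                return total_objects;
        return total_objects / expected_stripes;
}

/* Example: 1,000,000 free objects with an expected stripe count of 4 are
 * reported to userspace as roughly 250,000 creatable files. */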
+
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+ int rc = 0;
+
+ if (set == NULL)
+ return 0;
+
+ if (atomic_read(&set->set_completes)) {
+ rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+ atomic_read(&set->set_success));
+ }
+ lov_put_reqset(set);
+ return rc;
+}
+
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+ int success)
+{
+ int shift = 0, quit = 0;
+ __u64 tmp;
+
+ if (success == 0) {
+ memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
+ } else {
+ if (osfs->os_bsize != lov_sfs->os_bsize) {
+ /* assume all block sizes are always powers of 2 */
+ /* get the bits difference */
+ tmp = osfs->os_bsize | lov_sfs->os_bsize;
+ for (shift = 0; shift <= 64; ++shift) {
+ if (tmp & 1) {
+ if (quit)
+ break;
+ else
+ quit = 1;
+ shift = 0;
+ }
+ tmp >>= 1;
+ }
+ }
+
+ if (osfs->os_bsize < lov_sfs->os_bsize) {
+ osfs->os_bsize = lov_sfs->os_bsize;
+
+ osfs->os_bfree >>= shift;
+ osfs->os_bavail >>= shift;
+ osfs->os_blocks >>= shift;
+ } else if (shift != 0) {
+ lov_sfs->os_bfree >>= shift;
+ lov_sfs->os_bavail >>= shift;
+ lov_sfs->os_blocks >>= shift;
+ }
+ osfs->os_bfree += lov_sfs->os_bfree;
+ osfs->os_bavail += lov_sfs->os_bavail;
+ osfs->os_blocks += lov_sfs->os_blocks;
+ /* XXX not sure about this one - depends on policy.
+ * - could be minimum if we always stripe on all OBDs
+ * (but that would be wrong for any other policy,
+ * if one of the OBDs has no more objects left)
+ * - could be sum if we stripe whole objects
+ * - could be average, just to give a nice number
+ *
+ * To give a "reasonable" (if not wholly accurate)
+ * number, we divide the total number of free objects
+ * by expected stripe count (watch out for overflow).
+ */
+ LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+ LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+ }
+}
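When the per-OST block size differs from the accumulated one, lov_update_statfs() computes the bit-position difference between the two power-of-two sizes and shifts the finer-grained block counts right by that amount before summing. The bit-scan loop above can be restated more directly as a log2 ratio (bsize_shift is an illustrative helper, not the patch's code):

#include <stdint.h>

/* log2(big / small) for two power-of-two block sizes: how far the counts
 * kept in the smaller unit must be shifted right before being summed. */
static unsigned int bsize_shift(uint32_t bsize_a, uint32_t bsize_b)
{
        uint32_t small = bsize_a < bsize_b ? bsize_a : bsize_b;
        uint32_t big   = bsize_a < bsize_b ? bsize_b : bsize_a;
        unsigned int shift = 0;

        while (small < big) {
                small <<= 1;
                shift++;
        }
        return shift;
}

/* Example: bsize_shift(4096, 65536) == 4, so a count kept in 4 KiB blocks
 * is shifted right by 4 (divided by 16) before joining a 64 KiB total. */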
+
+/* The callback for osc_statfs_async that finalizes a request info when a
+ * response is received. */
+static int cb_statfs_update(void *cookie, int rc)
+{
+ struct obd_info *oinfo = cookie;
+ struct lov_request *lovreq;
+ struct lov_request_set *set;
+ struct obd_statfs *osfs, *lov_sfs;
+ struct lov_obd *lov;
+ struct lov_tgt_desc *tgt;
+ struct obd_device *lovobd, *tgtobd;
+ int success;
+
+ lovreq = container_of(oinfo, struct lov_request, rq_oi);
+ set = lovreq->rq_rqset;
+ lovobd = set->set_obd;
+ lov = &lovobd->u.lov;
+ osfs = set->set_oi->oi_osfs;
+ lov_sfs = oinfo->oi_osfs;
+ success = atomic_read(&set->set_success);
+ /* XXX: the same is done in lov_update_common_set, however
+ * lovset->set_exp is not initialized. */
+ lov_update_set(set, lovreq, rc);
+ if (rc)
+ goto out;
+
+ obd_getref(lovobd);
+ tgt = lov->lov_tgts[lovreq->rq_idx];
+ if (!tgt || !tgt->ltd_active)
+ goto out_update;
+
+ tgtobd = class_exp2obd(tgt->ltd_exp);
+ spin_lock(&tgtobd->obd_osfs_lock);
+ memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+ if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+ tgtobd->obd_osfs_age = cfs_time_current_64();
+ spin_unlock(&tgtobd->obd_osfs_lock);
+
+out_update:
+ lov_update_statfs(osfs, lov_sfs, success);
+ obd_putref(lovobd);
+
+out:
+ if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
+ lov_set_finished(set, 0)) {
+ lov_statfs_interpret(NULL, set, set->set_count !=
+ atomic_read(&set->set_success));
+ }
+
+ return 0;
+}
+
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+ struct lov_request_set **reqset)
+{
+ struct lov_request_set *set;
+ struct lov_obd *lov = &obd->u.lov;
+ int rc = 0, i;
+
+ OBD_ALLOC(set, sizeof(*set));
+ if (set == NULL)
+ return -ENOMEM;
+ lov_init_set(set);
+
+ set->set_obd = obd;
+ set->set_oi = oinfo;
+
+ /* We only get block data from the OBD */
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ struct lov_request *req;
+
+ if (lov->lov_tgts[i] == NULL ||
+ (!lov_check_and_wait_active(lov, i) &&
+ (oinfo->oi_flags & OBD_STATFS_NODELAY))) {
+ CDEBUG(D_HA, "lov idx %d inactive\n", i);
+ continue;
+ }
+
+ /* skip targets that have been explicitly disabled by the
+ * administrator */
+ if (!lov->lov_tgts[i]->ltd_exp) {
+ CDEBUG(D_HA, "lov idx %d administratively disabled\n", i);
+ continue;
+ }
+
+ OBD_ALLOC(req, sizeof(*req));
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out_set;
+ }
+
+ OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+ if (req->rq_oi.oi_osfs == NULL) {
+ OBD_FREE(req, sizeof(*req));
+ rc = -ENOMEM;
+ goto out_set;
+ }
+
+ req->rq_idx = i;
+ req->rq_oi.oi_cb_up = cb_statfs_update;
+ req->rq_oi.oi_flags = oinfo->oi_flags;
+
+ lov_set_add_req(req, set);
+ }
+ if (!set->set_count) {
+ rc = -EIO;
+ goto out_set;
+ }
+ *reqset = set;
+ return rc;
+out_set:
+ lov_fini_statfs_set(set);
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
new file mode 100644
index 000000000..42336f13a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
@@ -0,0 +1,209 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub transfer operations.
+ *
+ */
+
+static void lovsub_req_completion(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret)
+{
+ struct lovsub_req *lsr;
+
+ lsr = cl2lovsub_req(slice);
+ OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for lovsub
+ * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx
+ * field, which is filled there.
+ */
+static void lovsub_req_attr_set(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *attr, u64 flags)
+{
+ struct lovsub_object *subobj;
+
+ subobj = cl2lovsub(obj);
+ /*
+ * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it
+ * unconditionally. It never changes anyway.
+ */
+ attr->cra_oa->o_stripe_idx = subobj->lso_index;
+}
+
+static const struct cl_req_operations lovsub_req_ops = {
+ .cro_attr_set = lovsub_req_attr_set,
+ .cro_completion = lovsub_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov-sub device and device type functions.
+ *
+ */
+
+static int lovsub_device_init(const struct lu_env *env, struct lu_device *d,
+ const char *name, struct lu_device *next)
+{
+ struct lovsub_device *lsd = lu2lovsub_dev(d);
+ struct lu_device_type *ldt;
+ int rc;
+
+ next->ld_site = d->ld_site;
+ ldt = next->ld_type;
+ LASSERT(ldt != NULL);
+ rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL);
+ if (rc) {
+ next->ld_site = NULL;
+ return rc;
+ }
+
+ lu_device_get(next);
+ lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+ lsd->acid_next = lu2cl_dev(next);
+ return rc;
+}
+
+static struct lu_device *lovsub_device_fini(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct lu_device *next;
+ struct lovsub_device *lsd;
+
+ lsd = lu2lovsub_dev(d);
+ next = cl2lu_dev(lsd->acid_next);
+ lsd->acid_super = NULL;
+ lsd->acid_next = NULL;
+ return next;
+}
+
+static struct lu_device *lovsub_device_free(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct lovsub_device *lsd = lu2lovsub_dev(d);
+ struct lu_device *next = cl2lu_dev(lsd->acid_next);
+
+ if (atomic_read(&d->ld_ref) && d->ld_site) {
+ LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+ lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer);
+ }
+ cl_device_fini(lu2cl_dev(d));
+ OBD_FREE_PTR(lsd);
+ return next;
+}
+
+static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req)
+{
+ struct lovsub_req *lsr;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, GFP_NOFS);
+ if (lsr != NULL) {
+ cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops);
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+static const struct lu_device_operations lovsub_lu_ops = {
+ .ldo_object_alloc = lovsub_object_alloc,
+ .ldo_process_config = NULL,
+ .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations lovsub_cl_ops = {
+ .cdo_req_init = lovsub_req_init
+};
+
+static struct lu_device *lovsub_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg)
+{
+ struct lu_device *d;
+ struct lovsub_device *lsd;
+
+ OBD_ALLOC_PTR(lsd);
+ if (lsd != NULL) {
+ int result;
+
+ result = cl_device_init(&lsd->acid_cl, t);
+ if (result == 0) {
+ d = lovsub2lu_dev(lsd);
+ d->ld_ops = &lovsub_lu_ops;
+ lsd->acid_cl.cd_ops = &lovsub_cl_ops;
+ } else
+ d = ERR_PTR(result);
+ } else
+ d = ERR_PTR(-ENOMEM);
+ return d;
+}
+
+static const struct lu_device_type_operations lovsub_device_type_ops = {
+ .ldto_device_alloc = lovsub_device_alloc,
+ .ldto_device_free = lovsub_device_free,
+
+ .ldto_device_init = lovsub_device_init,
+ .ldto_device_fini = lovsub_device_fini
+};
+
+#define LUSTRE_LOVSUB_NAME "lovsub"
+
+struct lu_device_type lovsub_device_type = {
+ .ldt_tags = LU_DEVICE_CL,
+ .ldt_name = LUSTRE_LOVSUB_NAME,
+ .ldt_ops = &lovsub_device_type_ops,
+ .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_io.c b/drivers/staging/lustre/lustre/lov/lovsub_io.c
new file mode 100644
index 000000000..783ec687a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_io.c
@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOVSUB layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub io operations.
+ *
+ */
+
+/* All trivial */
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
new file mode 100644
index 000000000..62b696d25
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
@@ -0,0 +1,466 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOVSUB layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub lock operations.
+ *
+ */
+
+static void lovsub_lock_fini(const struct lu_env *env,
+ struct cl_lock_slice *slice)
+{
+ struct lovsub_lock *lsl;
+
+ lsl = cl2lovsub_lock(slice);
+ LASSERT(list_empty(&lsl->lss_parents));
+ OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem);
+}
+
+static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov)
+{
+ struct cl_lock *parent;
+
+ parent = lov->lls_cl.cls_lock;
+ cl_lock_get(parent);
+ lu_ref_add(&parent->cll_reference, "lovsub-parent", current);
+ cl_lock_mutex_get(env, parent);
+}
+
+static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov)
+{
+ struct cl_lock *parent;
+
+ parent = lov->lls_cl.cls_lock;
+ cl_lock_mutex_put(env, lov->lls_cl.cls_lock);
+ lu_ref_del(&parent->cll_reference, "lovsub-parent", current);
+ cl_lock_put(env, parent);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for lovsub layer, which
+ * method is called whenever sub-lock state changes. Propagates state change
+ * to the top-locks.
+ */
+static void lovsub_lock_state(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state state)
+{
+ struct lovsub_lock *sub = cl2lovsub_lock(slice);
+ struct lov_lock_link *scan;
+
+ LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+ list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+ struct lov_lock *lov = scan->lll_super;
+ struct cl_lock *parent = lov->lls_cl.cls_lock;
+
+ if (sub->lss_active != parent) {
+ lovsub_parent_lock(env, lov);
+ cl_lock_signal(env, parent);
+ lovsub_parent_unlock(env, lov);
+ }
+ }
+}
+
+/**
+ * Implementation of cl_lock_operation::clo_weigh() estimating lock weight by
+ * asking parent lock.
+ */
+static unsigned long lovsub_lock_weigh(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct lovsub_lock *lock = cl2lovsub_lock(slice);
+ struct lov_lock *lov;
+ unsigned long dumbbell;
+
+ LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+ if (!list_empty(&lock->lss_parents)) {
+ /*
+ * It is not clear whether all parents have to be asked and
+ * their estimates summed, or whether asking one is enough. For
+ * the current usage, one is always enough.
+ */
+ lov = container_of(lock->lss_parents.next,
+ struct lov_lock_link, lll_list)->lll_super;
+
+ lovsub_parent_lock(env, lov);
+ dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock);
+ lovsub_parent_unlock(env, lov);
+ } else
+ dumbbell = 0;
+
+ return dumbbell;
+}
+
+/**
+ * Maps start/end offsets within a stripe, to offsets within a file.
+ */
+static void lovsub_lock_descr_map(const struct cl_lock_descr *in,
+ struct lov_object *lov,
+ int stripe, struct cl_lock_descr *out)
+{
+ pgoff_t size; /* stripe size in pages */
+ pgoff_t skip; /* how many pages in every stripe are occupied by
+ * "other" stripes */
+ pgoff_t start;
+ pgoff_t end;
+
+ start = in->cld_start;
+ end = in->cld_end;
+
+ if (lov->lo_lsm->lsm_stripe_count > 1) {
+ size = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size);
+ skip = (lov->lo_lsm->lsm_stripe_count - 1) * size;
+
+ /* XXX overflow check here? */
+ start += start/size * skip + stripe * size;
+
+ if (end != CL_PAGE_EOF) {
+ end += end/size * skip + stripe * size;
+ /*
+ * And check for overflow...
+ */
+ if (end < in->cld_end)
+ end = CL_PAGE_EOF;
+ }
+ }
+ out->cld_start = start;
+ out->cld_end = end;
+}
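The mapping above relies on the RAID0 layout: a stripe's pages come in stripe-size chunks, and consecutive chunks of the same stripe are separated in file space by the chunks of the other (stripe_count - 1) stripes. A standalone sketch of the same index arithmetic (function and parameter names are illustrative):

#include <stdint.h>

/* Map a page index within stripe 'stripe' to its page index in the file,
 * for 'stripe_count' stripes of 'stripe_size' pages each. */
static uint64_t stripe_to_file_index(uint64_t in_stripe, unsigned int stripe,
                                     unsigned int stripe_count,
                                     uint64_t stripe_size)
{
        /* pages of the other stripes interleaved between this stripe's chunks */
        uint64_t skip = (uint64_t)(stripe_count - 1) * stripe_size;

        return in_stripe + (in_stripe / stripe_size) * skip +
               (uint64_t)stripe * stripe_size;
}

/* Example: 3 stripes of 256 pages; page 300 of stripe 1 is chunk 1, offset 44,
 * and maps to 300 + 1 * 512 + 256 = 1068 in the file. */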
+
+/**
+ * Adjusts parent lock extent when a sub-lock is attached to a parent. This is
+ * called in two ways:
+ *
+ * - as part of receive call-back, when server returns granted extent to
+ * the client, and
+ *
+ * - when top-lock finds existing sub-lock in the cache.
+ *
+ * Note that the lock mode is not propagated to the parent: i.e., if a CLM_READ
+ * top-lock matches a CLM_WRITE sub-lock, the top-lock is still CLM_READ.
+ */
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+ struct lovsub_lock *sublock,
+ const struct cl_lock_descr *d, int idx)
+{
+ struct cl_lock *parent;
+ struct lovsub_object *subobj;
+ struct cl_lock_descr *pd;
+ struct cl_lock_descr *parent_descr;
+ int result;
+
+ parent = lov->lls_cl.cls_lock;
+ parent_descr = &parent->cll_descr;
+ LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode));
+
+ subobj = cl2lovsub(sublock->lss_cl.cls_obj);
+ pd = &lov_env_info(env)->lti_ldescr;
+
+ pd->cld_obj = parent_descr->cld_obj;
+ pd->cld_mode = parent_descr->cld_mode;
+ pd->cld_gid = parent_descr->cld_gid;
+ lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd);
+ lov->lls_sub[idx].sub_got = *d;
+ /*
+ * Notify top-lock about modification, if lock description changes
+ * materially.
+ */
+ if (!cl_lock_ext_match(parent_descr, pd))
+ result = cl_lock_modify(env, parent, pd);
+ else
+ result = 0;
+ return result;
+}
+
+static int lovsub_lock_modify(const struct lu_env *env,
+ const struct cl_lock_slice *s,
+ const struct cl_lock_descr *d)
+{
+ struct lovsub_lock *lock = cl2lovsub_lock(s);
+ struct lov_lock_link *scan;
+ struct lov_lock *lov;
+ int result = 0;
+
+ LASSERT(cl_lock_mode_match(d->cld_mode,
+ s->cls_lock->cll_descr.cld_mode));
+ list_for_each_entry(scan, &lock->lss_parents, lll_list) {
+ int rc;
+
+ lov = scan->lll_super;
+ lovsub_parent_lock(env, lov);
+ rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx);
+ lovsub_parent_unlock(env, lov);
+ result = result ?: rc;
+ }
+ return result;
+}
+
+static int lovsub_lock_closure(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_lock_closure *closure)
+{
+ struct lovsub_lock *sub;
+ struct cl_lock *parent;
+ struct lov_lock_link *scan;
+ int result;
+
+ LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+ sub = cl2lovsub_lock(slice);
+ result = 0;
+
+ list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+ parent = scan->lll_super->lls_cl.cls_lock;
+ result = cl_lock_closure_build(env, parent, closure);
+ if (result != 0)
+ break;
+ }
+ return result;
+}
+
+/**
+ * A helper function for lovsub_lock_delete() that deals with a given parent
+ * top-lock.
+ */
+static int lovsub_lock_delete_one(const struct lu_env *env,
+ struct cl_lock *child, struct lov_lock *lov)
+{
+ struct cl_lock *parent;
+ int result;
+
+ parent = lov->lls_cl.cls_lock;
+ if (parent->cll_error)
+ return 0;
+
+ result = 0;
+ switch (parent->cll_state) {
+ case CLS_ENQUEUED:
+ /* See LU-1355 for the case where a glimpse lock is
+ * interrupted by a signal */
+ LASSERT(parent->cll_flags & CLF_CANCELLED);
+ break;
+ case CLS_QUEUING:
+ case CLS_FREEING:
+ cl_lock_signal(env, parent);
+ break;
+ case CLS_INTRANSIT:
+ /*
+ * Here lies a problem: a sub-lock is canceled while top-lock
+ * is being unlocked. Top-lock cannot be moved into CLS_NEW
+ * state, because unlocking has to succeed eventually by
+ * placing lock into CLS_CACHED (or failing it), see
+ * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED
+ * state, because lov maintains an invariant that all
+ * sub-locks exist in CLS_CACHED (this allows cached top-lock
+ * to be reused immediately). Nor can we wait for top-lock
+ * state to change, because this can be synchronous to the
+ * current thread.
+ *
+ * We know for sure that lov_lock_unuse() will be called at
+ * least one more time to finish un-using, so leave a mark on
+ * the top-lock, that will be seen by the next call to
+ * lov_lock_unuse().
+ */
+ if (cl_lock_is_intransit(parent))
+ lov->lls_cancel_race = 1;
+ break;
+ case CLS_CACHED:
+ /*
+ * if a sub-lock is canceled move its top-lock into CLS_NEW
+ * state to preserve an invariant that a top-lock in
+ * CLS_CACHED is immediately ready for re-use (i.e., has all
+ * sub-locks), and so that next attempt to re-use the top-lock
+ * enqueues missing sub-lock.
+ */
+ cl_lock_state_set(env, parent, CLS_NEW);
+ /* fall through */
+ case CLS_NEW:
+ /*
+ * if last sub-lock is canceled, destroy the top-lock (which
+ * is now `empty') proactively.
+ */
+ if (lov->lls_nr_filled == 0) {
+ /* ... but unfortunately, this cannot be done easily,
+ * as cancellation of a top-lock might acquire mutexes
+ * of its other sub-locks, violating lock ordering,
+ * see cl_lock_{cancel,delete}() preconditions.
+ *
+ * To work around this, the mutex of this sub-lock is
+ * released, top-lock is destroyed, and sub-lock mutex
+ * acquired again. The list of parents has to be
+ * re-scanned from the beginning after this.
+ *
+ * Only do this if no mutexes other than on @child and
+ * @parent are held by the current thread.
+ *
+ * TODO: The locking model here is too complex, because
+ * the lock may be canceled and deleted voluntarily:
+ * cl_lock_request
+ * -> osc_lock_enqueue_wait
+ * -> osc_lock_cancel_wait
+ * -> cl_lock_delete
+ * -> lovsub_lock_delete
+ * -> cl_lock_cancel/delete
+ * -> ...
+ *
+ * The better choice is to spawn a kernel thread for
+ * this purpose. -jay
+ */
+ if (cl_lock_nr_mutexed(env) == 2) {
+ cl_lock_mutex_put(env, child);
+ cl_lock_cancel(env, parent);
+ cl_lock_delete(env, parent);
+ result = 1;
+ }
+ }
+ break;
+ case CLS_HELD:
+ CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n");
+ default:
+ CERROR("Impossible state: %d\n", parent->cll_state);
+ LBUG();
+ break;
+ }
+
+ return result;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked in "bottom-to-top" delete, when lock destruction starts from the
+ * sub-lock (e.g, as a result of ldlm lock LRU policy).
+ */
+static void lovsub_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct cl_lock *child = slice->cls_lock;
+ struct lovsub_lock *sub = cl2lovsub_lock(slice);
+ int restart;
+
+ LASSERT(cl_lock_is_mutexed(child));
+
+ /*
+ * Destruction of a sub-lock might take multiple iterations, because
+ * when the last sub-lock of a given top-lock is deleted, top-lock is
+ * canceled proactively, and this requires to release sub-lock
+ * mutex. Once sub-lock mutex has been released, list of its parents
+ * has to be re-scanned from the beginning.
+ */
+ do {
+ struct lov_lock *lov;
+ struct lov_lock_link *scan;
+ struct lov_lock_link *temp;
+ struct lov_lock_sub *subdata;
+
+ restart = 0;
+ list_for_each_entry_safe(scan, temp,
+ &sub->lss_parents, lll_list) {
+ lov = scan->lll_super;
+ subdata = &lov->lls_sub[scan->lll_idx];
+ lovsub_parent_lock(env, lov);
+ subdata->sub_got = subdata->sub_descr;
+ lov_lock_unlink(env, scan, sub);
+ restart = lovsub_lock_delete_one(env, child, lov);
+ lovsub_parent_unlock(env, lov);
+
+ if (restart) {
+ cl_lock_mutex_get(env, child);
+ break;
+ }
+ }
+ } while (restart);
+}
+
+static int lovsub_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_lock_slice *slice)
+{
+ struct lovsub_lock *sub = cl2lovsub_lock(slice);
+ struct lov_lock *lov;
+ struct lov_lock_link *scan;
+
+ list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+ lov = scan->lll_super;
+ (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov);
+ if (lov != NULL)
+ cl_lock_descr_print(env, cookie, p,
+ &lov->lls_cl.cls_lock->cll_descr);
+ (*p)(env, cookie, "] ");
+ }
+ return 0;
+}
+
+static const struct cl_lock_operations lovsub_lock_ops = {
+ .clo_fini = lovsub_lock_fini,
+ .clo_state = lovsub_lock_state,
+ .clo_delete = lovsub_lock_delete,
+ .clo_modify = lovsub_lock_modify,
+ .clo_closure = lovsub_lock_closure,
+ .clo_weigh = lovsub_lock_weigh,
+ .clo_print = lovsub_lock_print
+};
+
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_lock *lock, const struct cl_io *io)
+{
+ struct lovsub_lock *lsk;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, GFP_NOFS);
+ if (lsk != NULL) {
+ INIT_LIST_HEAD(&lsk->lss_parents);
+ cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops);
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c
new file mode 100644
index 000000000..57e3629fc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_object.c
@@ -0,0 +1,164 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOVSUB layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub object operations.
+ *
+ */
+
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf)
+{
+ struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev);
+ struct lu_object *below;
+ struct lu_device *under;
+
+ int result;
+
+ under = &dev->acid_next->cd_lu_dev;
+ below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+ if (below != NULL) {
+ lu_object_add(obj, below);
+ cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page));
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+
+}
+
+static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+ struct lovsub_object *los = lu2lovsub(obj);
+ struct lov_object *lov = los->lso_super;
+
+ /* We can't assume lov was assigned here, because of the shadow
+ * object handling in lu_object_find.
+ */
+ if (lov) {
+ LASSERT(lov->lo_type == LLT_RAID0);
+ LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los);
+ spin_lock(&lov->u.raid0.lo_sub_lock);
+ lov->u.raid0.lo_sub[los->lso_index] = NULL;
+ spin_unlock(&lov->u.raid0.lo_sub_lock);
+ }
+
+ lu_object_fini(obj);
+ lu_object_header_fini(&los->lso_header.coh_lu);
+ OBD_SLAB_FREE_PTR(los, lovsub_object_kmem);
+}
+
+static int lovsub_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *obj)
+{
+ struct lovsub_object *los = lu2lovsub(obj);
+
+ return (*p)(env, cookie, "[%d]", los->lso_index);
+}
+
+static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ struct lov_object *lov = cl2lovsub(obj)->lso_super;
+
+ lov_r0(lov)->lo_attr_valid = 0;
+ return 0;
+}
+
+static int lovsub_object_glimpse(const struct lu_env *env,
+ const struct cl_object *obj,
+ struct ost_lvb *lvb)
+{
+ struct lovsub_object *los = cl2lovsub(obj);
+
+ return cl_object_glimpse(env, &los->lso_super->lo_cl, lvb);
+}
+
+
+
+static const struct cl_object_operations lovsub_ops = {
+ .coo_page_init = lovsub_page_init,
+ .coo_lock_init = lovsub_lock_init,
+ .coo_attr_set = lovsub_attr_set,
+ .coo_glimpse = lovsub_object_glimpse
+};
+
+static const struct lu_object_operations lovsub_lu_obj_ops = {
+ .loo_object_init = lovsub_object_init,
+ .loo_object_delete = NULL,
+ .loo_object_release = NULL,
+ .loo_object_free = lovsub_object_free,
+ .loo_object_print = lovsub_object_print,
+ .loo_object_invariant = NULL
+};
+
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *unused,
+ struct lu_device *dev)
+{
+ struct lovsub_object *los;
+ struct lu_object *obj;
+
+ OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, GFP_NOFS);
+ if (los != NULL) {
+ struct cl_object_header *hdr;
+
+ obj = lovsub2lu(los);
+ hdr = &los->lso_header;
+ cl_object_header_init(hdr);
+ lu_object_init(obj, &hdr->coh_lu, dev);
+ lu_object_add_top(&hdr->coh_lu, obj);
+ los->lso_cl.co_ops = &lovsub_ops;
+ obj->lo_ops = &lovsub_lu_obj_ops;
+ } else
+ obj = NULL;
+ return obj;
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c
new file mode 100644
index 000000000..3f00ce967
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lovsub_page.c
@@ -0,0 +1,71 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOVSUB layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub page operations.
+ *
+ */
+
+static void lovsub_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+}
+
+static const struct cl_page_operations lovsub_page_ops = {
+ .cpo_fini = lovsub_page_fini
+};
+
+int lovsub_page_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *unused)
+{
+ struct lovsub_page *lsb = cl_object_page_slice(obj, page);
+
+ cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops);
+ return 0;
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c
new file mode 100644
index 000000000..174cbf5c1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/lov/lproc_lov.c
@@ -0,0 +1,311 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/statfs.h>
+#include "../include/lprocfs_status.h"
+#include "../include/obd_class.h"
+#include <linux/seq_file.h>
+#include "lov_internal.h"
+
+static int lov_stripesize_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ seq_printf(m, "%llu\n", desc->ld_default_stripe_size);
+ return 0;
+}
+
+static ssize_t lov_stripesize_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct lov_desc *desc;
+ __u64 val;
+ int rc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ rc = lprocfs_write_u64_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ lov_fix_desc_stripe_size(&val);
+ desc->ld_default_stripe_size = val;
+ return count;
+}
+LPROC_SEQ_FOPS(lov_stripesize);
+
+static int lov_stripeoffset_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ seq_printf(m, "%llu\n", desc->ld_default_stripe_offset);
+ return 0;
+}
+
+static ssize_t lov_stripeoffset_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct lov_desc *desc;
+ __u64 val;
+ int rc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ rc = lprocfs_write_u64_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ desc->ld_default_stripe_offset = val;
+ return count;
+}
+LPROC_SEQ_FOPS(lov_stripeoffset);
+
+static int lov_stripetype_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ seq_printf(m, "%u\n", desc->ld_pattern);
+ return 0;
+}
+
+static ssize_t lov_stripetype_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct lov_desc *desc;
+ int val, rc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ lov_fix_desc_pattern(&val);
+ desc->ld_pattern = val;
+ return count;
+}
+LPROC_SEQ_FOPS(lov_stripetype);
+
+static int lov_stripecount_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ seq_printf(m, "%d\n", (__s16)(desc->ld_default_stripe_count + 1) - 1);
+ return 0;
+}
+
+static ssize_t lov_stripecount_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct lov_desc *desc;
+ int val, rc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ lov_fix_desc_stripe_count(&val);
+ desc->ld_default_stripe_count = val;
+ return count;
+}
+LPROC_SEQ_FOPS(lov_stripecount);
+
+static int lov_numobd_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ seq_printf(m, "%u\n", desc->ld_tgt_count);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lov_numobd);
+
+static int lov_activeobd_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_desc *desc;
+
+ LASSERT(dev != NULL);
+ desc = &dev->u.lov.desc;
+ seq_printf(m, "%u\n", desc->ld_active_tgt_count);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lov_activeobd);
+
+static int lov_desc_uuid_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = (struct obd_device *)m->private;
+ struct lov_obd *lov;
+
+ LASSERT(dev != NULL);
+ lov = &dev->u.lov;
+ seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(lov_desc_uuid);
+
+static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+ struct obd_device *dev = p->private;
+ struct lov_obd *lov = &dev->u.lov;
+
+ while (*pos < lov->desc.ld_tgt_count) {
+ if (lov->lov_tgts[*pos])
+ return lov->lov_tgts[*pos];
+ ++*pos;
+ }
+ return NULL;
+}
+
+static void lov_tgt_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct obd_device *dev = p->private;
+ struct lov_obd *lov = &dev->u.lov;
+
+ while (++*pos < lov->desc.ld_tgt_count) {
+ if (lov->lov_tgts[*pos])
+ return lov->lov_tgts[*pos];
+ }
+ return NULL;
+}
+
+static int lov_tgt_seq_show(struct seq_file *p, void *v)
+{
+ struct lov_tgt_desc *tgt = v;
+
+ seq_printf(p, "%d: %s %sACTIVE\n",
+ tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid),
+ tgt->ltd_active ? "" : "IN");
+ return 0;
+}
+
+static const struct seq_operations lov_tgt_sops = {
+ .start = lov_tgt_seq_start,
+ .stop = lov_tgt_seq_stop,
+ .next = lov_tgt_seq_next,
+ .show = lov_tgt_seq_show,
+};
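+
+/*
+ * The iterator above walks lov->lov_tgts[] up to ld_tgt_count and skips
+ * NULL slots, so only currently populated targets are reported through the
+ * file served by lov_proc_target_fops below.
+ */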
+
+static int lov_target_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc;
+
+ rc = seq_open(file, &lov_tgt_sops);
+ if (rc)
+ return rc;
+
+ seq = file->private_data;
+ seq->private = PDE_DATA(inode);
+ return 0;
+}
+
+LPROC_SEQ_FOPS_RO_TYPE(lov, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(lov, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail);
+
+static struct lprocfs_vars lprocfs_lov_obd_vars[] = {
+ { "uuid", &lov_uuid_fops, NULL, 0 },
+ { "stripesize", &lov_stripesize_fops, NULL },
+ { "stripeoffset", &lov_stripeoffset_fops, NULL },
+ { "stripecount", &lov_stripecount_fops, NULL },
+ { "stripetype", &lov_stripetype_fops, NULL },
+ { "numobd", &lov_numobd_fops, NULL, 0 },
+ { "activeobd", &lov_activeobd_fops, NULL, 0 },
+ { "filestotal", &lov_filestotal_fops, NULL, 0 },
+ { "filesfree", &lov_filesfree_fops, NULL, 0 },
+ /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/
+ { "blocksize", &lov_blksize_fops, NULL, 0 },
+ { "kbytestotal", &lov_kbytestotal_fops, NULL, 0 },
+ { "kbytesfree", &lov_kbytesfree_fops, NULL, 0 },
+ { "kbytesavail", &lov_kbytesavail_fops, NULL, 0 },
+ { "desc_uuid", &lov_desc_uuid_fops, NULL, 0 },
+ { NULL }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(lov, numrefs);
+
+static struct lprocfs_vars lprocfs_lov_module_vars[] = {
+ { "num_refs", &lov_numrefs_fops, NULL, 0 },
+ { NULL }
+};
+
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = lprocfs_lov_module_vars;
+ lvars->obd_vars = lprocfs_lov_obd_vars;
+}
+
+const struct file_operations lov_proc_target_fops = {
+ .owner = THIS_MODULE,
+ .open = lov_target_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = lprocfs_seq_release,
+};
diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile
new file mode 100644
index 000000000..2516551a6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LUSTRE_FS) += mdc.o
+mdc-y := mdc_request.o mdc_reint.o mdc_lib.o mdc_locks.o
+mdc-$(CONFIG_PROC_FS) += lproc_mdc.o
diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
new file mode 100644
index 000000000..acfe08e45
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
@@ -0,0 +1,220 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/vfs.h>
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "mdc_internal.h"
+
+static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+
+static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count,
+ loff_t *off)
+{
+ struct obd_device *dev =
+ ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &dev->u.cli;
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val < 1 || val > MDC_MAX_RIF_MAX)
+ return -ERANGE;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_max_rpcs_in_flight = val;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight);
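+
+/*
+ * The value written here is the cl_max_rpcs_in_flight limit consulted by
+ * mdc_enter_request()/mdc_exit_request() when throttling metadata RPCs.
+ */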
+
+static int mdc_kuc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, PDE_DATA(inode));
+}
+
+/* temporary for testing */
+static ssize_t mdc_kuc_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd =
+ ((struct seq_file *)file->private_data)->private;
+ struct kuc_hdr *lh;
+ struct hsm_action_list *hal;
+ struct hsm_action_item *hai;
+ int len;
+ int fd, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &fd);
+ if (rc)
+ return rc;
+
+ if (fd < 0)
+ return -ERANGE;
+ CWARN("message to fd %d\n", fd);
+
+ len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN +
+ /* for mockup below */ 2 * cfs_size_round(sizeof(*hai));
+
+ OBD_ALLOC(lh, len);
+ if (!lh)
+ return -ENOMEM;
+
+ lh->kuc_magic = KUC_MAGIC;
+ lh->kuc_transport = KUC_TRANSPORT_HSM;
+ lh->kuc_msgtype = HMT_ACTION_LIST;
+ lh->kuc_msglen = len;
+
+ hal = (struct hsm_action_list *)(lh + 1);
+ hal->hal_version = HAL_VERSION;
+ hal->hal_archive_id = 1;
+ hal->hal_flags = 0;
+ obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN);
+
+ /* mock up an action list */
+ hal->hal_count = 2;
+ hai = hai_zero(hal);
+ hai->hai_action = HSMA_ARCHIVE;
+ hai->hai_fid.f_oid = 5;
+ hai->hai_len = sizeof(*hai);
+ hai = hai_next(hai);
+ hai->hai_action = HSMA_RESTORE;
+ hai->hai_fid.f_oid = 10;
+ hai->hai_len = sizeof(*hai);
+
+ /* This works for either broadcast or unicast to a single fd */
+ if (fd == 0) {
+ rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+ } else {
+ struct file *fp = fget(fd);
+
+ /* fget() returns NULL if fd is not an open file descriptor */
+ if (fp == NULL) {
+ rc = -EBADF;
+ } else {
+ rc = libcfs_kkuc_msg_put(fp, lh);
+ fput(fp);
+ }
+ }
+ OBD_FREE(lh, len);
+ if (rc < 0)
+ return rc;
+ return count;
+}
+
+struct file_operations mdc_kuc_fops = {
+ .open = mdc_kuc_open,
+ .write = mdc_kuc_write,
+ .release = single_release,
+};
+
+LPROC_SEQ_FOPS_WR_ONLY(mdc, ping);
+
+LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, state);
+
+static int mdc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v)
+{
+ return lprocfs_obd_rd_max_pages_per_rpc(m, m->private);
+}
+LPROC_SEQ_FOPS_RO(mdc_obd_max_pages_per_rpc);
+
+LPROC_SEQ_FOPS_RW_TYPE(mdc, import);
+LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov);
+
+static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
+ { "uuid", &mdc_uuid_fops, NULL, 0 },
+ { "ping", &mdc_ping_fops, NULL, 0222 },
+ { "connect_flags", &mdc_connect_flags_fops, NULL, 0 },
+ { "blocksize", &mdc_blksize_fops, NULL, 0 },
+ { "kbytestotal", &mdc_kbytestotal_fops, NULL, 0 },
+ { "kbytesfree", &mdc_kbytesfree_fops, NULL, 0 },
+ { "kbytesavail", &mdc_kbytesavail_fops, NULL, 0 },
+ { "filestotal", &mdc_filestotal_fops, NULL, 0 },
+ { "filesfree", &mdc_filesfree_fops, NULL, 0 },
+ /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/
+ { "mds_server_uuid", &mdc_server_uuid_fops, NULL, 0 },
+ { "mds_conn_uuid", &mdc_conn_uuid_fops, NULL, 0 },
+ /*
+ * FIXME: the proc entry below is provided but not in use; instead
+ * sbi->sb_md_brw_size is used. The per-obd variable should be used
+ * once CMD is enabled and dir pages are managed in the MDC layer.
+ * Remember to enable the proc write function at that point.
+ */
+ { "max_pages_per_rpc", &mdc_obd_max_pages_per_rpc_fops, NULL, 0 },
+ { "max_rpcs_in_flight", &mdc_max_rpcs_in_flight_fops, NULL, 0 },
+ { "timeouts", &mdc_timeouts_fops, NULL, 0 },
+ { "import", &mdc_import_fops, NULL, 0 },
+ { "state", &mdc_state_fops, NULL, 0 },
+ { "hsm_nl", &mdc_kuc_fops, NULL, 0200 },
+ { "pinger_recov", &mdc_pinger_recov_fops, NULL, 0 },
+ { NULL }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(mdc, numrefs);
+
+static struct lprocfs_vars lprocfs_mdc_module_vars[] = {
+ { "num_refs", &mdc_numrefs_fops, NULL, 0 },
+ { NULL }
+};
+
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = lprocfs_mdc_module_vars;
+ lvars->obd_vars = lprocfs_mdc_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
new file mode 100644
index 000000000..81780c943
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MDC_INTERNAL_H
+#define _MDC_INTERNAL_H
+
+#include "../include/lustre_mdc.h"
+#include "../include/lustre_mds.h"
+
+#if defined CONFIG_PROC_FS
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+#endif
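+
+/*
+ * With CONFIG_PROC_FS disabled, the stub above simply zeroes *lvars, so
+ * callers can invoke lprocfs_mdc_init_vars() unconditionally without extra
+ * #ifdefs of their own.
+ */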
+
+void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid,
+ struct obd_capa *oc, __u64 valid, int ea_size,
+ __u32 suppgid, int flags);
+void mdc_pack_capa(struct ptlrpc_request *req,
+ const struct req_msg_field *field, struct obd_capa *oc);
+int mdc_pack_req(struct ptlrpc_request *req, int version, int opc);
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+ const struct lu_fid *cfid, int flags);
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+ struct md_op_data *op_data);
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size,
+ const struct lu_fid *fid, struct obd_capa *oc);
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+ struct md_op_data *data, int ea_size);
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len);
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ const void *data, int datalen, __u32 mode, __u32 uid,
+ __u32 gid, cfs_cap_t capability, __u64 rdev);
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ __u32 mode, __u64 rdev, __u64 flags, const void *data,
+ int datalen);
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_getxattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new, int newlen);
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+int mdc_enter_request(struct client_obd *cli);
+void mdc_exit_request(struct client_obd *cli);
+
+/* mdc/mdc_locks.c */
+int mdc_set_lock_data(struct obd_export *exp,
+ __u64 *lockh, void *data, __u64 *bits);
+
+int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid);
+
+int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+ ldlm_iterator_t it, void *data);
+
+int mdc_intent_lock(struct obd_export *exp,
+ struct md_op_data *,
+ void *lmm, int lmmsize,
+ struct lookup_intent *, int,
+ struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags);
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it, struct md_op_data *op_data,
+ struct lustre_handle *lockh, void *lmm, int lmmsize,
+ struct ptlrpc_request **req, __u64 extra_lock_flags);
+
+int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid,
+ struct list_head *cancels, ldlm_mode_t mode,
+ __u64 bits);
+/* mdc/mdc_request.c */
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+ struct md_op_data *op_data);
+
+int mdc_open(struct obd_export *exp, u64 ino, int type, int flags,
+ struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
+ struct ptlrpc_request **);
+
+struct obd_client_handle;
+
+int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req,
+ struct obd_export *dt_exp, struct obd_export *lmv_exp,
+ struct lustre_md *md);
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och,
+ struct lookup_intent *it);
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och);
+void mdc_commit_open(struct ptlrpc_request *req);
+void mdc_replay_open(struct ptlrpc_request *req);
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+ const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+ cfs_cap_t capability, __u64 rdev,
+ struct ptlrpc_request **request);
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request);
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new, int newlen,
+ struct ptlrpc_request **request);
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len,
+ struct ptlrpc_request **request, struct md_open_data **mod);
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request);
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags, void *opaque);
+
+static inline void mdc_set_capa_size(struct ptlrpc_request *req,
+ const struct req_msg_field *field,
+ struct obd_capa *oc)
+{
+ if (oc == NULL)
+ req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+ /* otherwise it is already calculated as sizeof(struct obd_capa) */
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+ struct lu_fid *fid, __u64 *bits);
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+ struct md_enqueue_info *minfo,
+ struct ldlm_enqueue_info *einfo);
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ struct lustre_handle *lockh);
+
+static inline int mdc_prep_elc_req(struct obd_export *exp,
+ struct ptlrpc_request *req, int opc,
+ struct list_head *cancels, int count)
+{
+ return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels,
+ count);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
new file mode 100644
index 000000000..d3234cb1e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
@@ -0,0 +1,593 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+#include "../include/lustre_net.h"
+#include "../include/lustre/lustre_idl.h"
+#include "mdc_internal.h"
+
+
+static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid)
+{
+ LASSERT(b != NULL);
+
+ b->suppgid = suppgid;
+ b->uid = from_kuid(&init_user_ns, current_uid());
+ b->gid = from_kgid(&init_user_ns, current_gid());
+ b->fsuid = from_kuid(&init_user_ns, current_fsuid());
+ b->fsgid = from_kgid(&init_user_ns, current_fsgid());
+ b->capability = cfs_curproc_cap_pack();
+}
+
+void mdc_pack_capa(struct ptlrpc_request *req,
+ const struct req_msg_field *field,
+ struct obd_capa *oc)
+{
+ struct req_capsule *pill = &req->rq_pill;
+ struct lustre_capa *c;
+
+ if (oc == NULL) {
+ LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0);
+ return;
+ }
+
+ c = req_capsule_client_get(pill, field);
+ LASSERT(c != NULL);
+ capa_cpy(c, oc);
+ DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+ const struct lu_fid *cfid, int flags)
+{
+ struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+ &RMF_MDT_BODY);
+
+ if (pfid) {
+ b->fid1 = *pfid;
+ b->valid = OBD_MD_FLID;
+ }
+ if (cfid)
+ b->fid2 = *cfid;
+ b->flags = flags;
+}
+
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+ struct md_op_data *op_data)
+{
+ struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+ &RMF_MDT_BODY);
+
+ __mdc_pack_body(b, op_data->op_suppgids[0]);
+ b->fid1 = op_data->op_fid1;
+ b->fid2 = op_data->op_fid2;
+ b->valid |= OBD_MD_FLID;
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+}
+
+void mdc_pack_body(struct ptlrpc_request *req,
+ const struct lu_fid *fid, struct obd_capa *oc,
+ __u64 valid, int ea_size, __u32 suppgid, int flags)
+{
+ struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+ &RMF_MDT_BODY);
+ LASSERT(b != NULL);
+ b->valid = valid;
+ b->eadatasize = ea_size;
+ b->flags = flags;
+ __mdc_pack_body(b, suppgid);
+ if (fid) {
+ b->fid1 = *fid;
+ b->valid |= OBD_MD_FLID;
+ mdc_pack_capa(req, &RMF_CAPA1, oc);
+ }
+}
+
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff,
+ __u32 size, const struct lu_fid *fid, struct obd_capa *oc)
+{
+ struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+ &RMF_MDT_BODY);
+ b->fid1 = *fid;
+ b->valid |= OBD_MD_FLID;
+ b->size = pgoff; /* !! */
+ b->nlink = size; /* !! */
+ __mdc_pack_body(b, -1);
+ b->mode = LUDA_FID | LUDA_TYPE;
+
+ mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+/* packing of MDS records */
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ const void *data, int datalen, __u32 mode,
+ __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev)
+{
+ struct mdt_rec_create *rec;
+ char *tmp;
+ __u64 flags;
+
+ CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+ rec->cr_opcode = REINT_CREATE;
+ rec->cr_fsuid = uid;
+ rec->cr_fsgid = gid;
+ rec->cr_cap = cap_effective;
+ rec->cr_fid1 = op_data->op_fid1;
+ rec->cr_fid2 = op_data->op_fid2;
+ rec->cr_mode = mode;
+ rec->cr_rdev = rdev;
+ rec->cr_time = op_data->op_mod_time;
+ rec->cr_suppgid1 = op_data->op_suppgids[0];
+ rec->cr_suppgid2 = op_data->op_suppgids[1];
+ flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+ if (op_data->op_bias & MDS_CREATE_VOLATILE)
+ flags |= MDS_OPEN_VOLATILE;
+ set_mrc_cr_flags(rec, flags);
+ rec->cr_bias = op_data->op_bias;
+ rec->cr_umask = current_umask();
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+ if (data) {
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+ memcpy(tmp, data, datalen);
+ }
+}
+
+static __u64 mds_pack_open_flags(__u64 flags, __u32 mode)
+{
+ __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE |
+ MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |
+ MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |
+ MDS_OPEN_BY_FID | MDS_OPEN_LEASE |
+ MDS_OPEN_RELEASE));
+ if (flags & O_CREAT)
+ cr_flags |= MDS_OPEN_CREAT;
+ if (flags & O_EXCL)
+ cr_flags |= MDS_OPEN_EXCL;
+ if (flags & O_TRUNC)
+ cr_flags |= MDS_OPEN_TRUNC;
+ if (flags & O_APPEND)
+ cr_flags |= MDS_OPEN_APPEND;
+ if (flags & O_SYNC)
+ cr_flags |= MDS_OPEN_SYNC;
+ if (flags & O_DIRECTORY)
+ cr_flags |= MDS_OPEN_DIRECTORY;
+ if (flags & __FMODE_EXEC)
+ cr_flags |= MDS_FMODE_EXEC;
+ if (cl_is_lov_delay_create(flags))
+ cr_flags |= MDS_OPEN_DELAY_CREATE;
+
+ if (flags & O_NONBLOCK)
+ cr_flags |= MDS_OPEN_NORESTORE;
+
+ return cr_flags;
+}
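+
+/*
+ * Illustrative sketch, not part of the original code: how a create-style
+ * open would be translated by the helper above. The wrapper name is
+ * hypothetical; only mds_pack_open_flags() itself is real.
+ */
+static inline __u64 mdc_example_create_open_flags(void)
+{
+ /* O_CREAT and O_EXCL map to MDS_OPEN_CREAT and MDS_OPEN_EXCL; the
+ * write fmode bit is passed through by the mask at the top of
+ * mds_pack_open_flags(). */
+ return mds_pack_open_flags(FMODE_WRITE | O_CREAT | O_EXCL, S_IFREG);
+}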
+
+/* packing of MDS records */
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ __u32 mode, __u64 rdev, __u64 flags, const void *lmm,
+ int lmmlen)
+{
+ struct mdt_rec_create *rec;
+ char *tmp;
+ __u64 cr_flags;
+
+ CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+ /* XXX do something about time, uid, gid */
+ rec->cr_opcode = REINT_OPEN;
+ rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ rec->cr_cap = cfs_curproc_cap_pack();
+ rec->cr_fid1 = op_data->op_fid1;
+ rec->cr_fid2 = op_data->op_fid2;
+
+ rec->cr_mode = mode;
+ cr_flags = mds_pack_open_flags(flags, mode);
+ rec->cr_rdev = rdev;
+ rec->cr_time = op_data->op_mod_time;
+ rec->cr_suppgid1 = op_data->op_suppgids[0];
+ rec->cr_suppgid2 = op_data->op_suppgids[1];
+ rec->cr_bias = op_data->op_bias;
+ rec->cr_umask = current_umask();
+ rec->cr_old_handle = op_data->op_handle;
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+ /* The next buffer is the child capa, which is used for replay;
+ * it will be packed from the data in the reply message. */
+
+ if (op_data->op_name) {
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+ if (op_data->op_bias & MDS_CREATE_VOLATILE)
+ cr_flags |= MDS_OPEN_VOLATILE;
+ }
+
+ if (lmm) {
+ cr_flags |= MDS_OPEN_HAS_EA;
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+ memcpy(tmp, lmm, lmmlen);
+ }
+ set_mrc_cr_flags(rec, cr_flags);
+}
+
+static inline __u64 attr_pack(unsigned int ia_valid)
+{
+ __u64 sa_valid = 0;
+
+ if (ia_valid & ATTR_MODE)
+ sa_valid |= MDS_ATTR_MODE;
+ if (ia_valid & ATTR_UID)
+ sa_valid |= MDS_ATTR_UID;
+ if (ia_valid & ATTR_GID)
+ sa_valid |= MDS_ATTR_GID;
+ if (ia_valid & ATTR_SIZE)
+ sa_valid |= MDS_ATTR_SIZE;
+ if (ia_valid & ATTR_ATIME)
+ sa_valid |= MDS_ATTR_ATIME;
+ if (ia_valid & ATTR_MTIME)
+ sa_valid |= MDS_ATTR_MTIME;
+ if (ia_valid & ATTR_CTIME)
+ sa_valid |= MDS_ATTR_CTIME;
+ if (ia_valid & ATTR_ATIME_SET)
+ sa_valid |= MDS_ATTR_ATIME_SET;
+ if (ia_valid & ATTR_MTIME_SET)
+ sa_valid |= MDS_ATTR_MTIME_SET;
+ if (ia_valid & ATTR_FORCE)
+ sa_valid |= MDS_ATTR_FORCE;
+ if (ia_valid & ATTR_ATTR_FLAG)
+ sa_valid |= MDS_ATTR_ATTR_FLAG;
+ if (ia_valid & ATTR_KILL_SUID)
+ sa_valid |= MDS_ATTR_KILL_SUID;
+ if (ia_valid & ATTR_KILL_SGID)
+ sa_valid |= MDS_ATTR_KILL_SGID;
+ if (ia_valid & ATTR_CTIME_SET)
+ sa_valid |= MDS_ATTR_CTIME_SET;
+ if (ia_valid & ATTR_FROM_OPEN)
+ sa_valid |= MDS_ATTR_FROM_OPEN;
+ if (ia_valid & ATTR_BLOCKS)
+ sa_valid |= MDS_ATTR_BLOCKS;
+ if (ia_valid & MDS_OPEN_OWNEROVERRIDE)
+ /* NFSD hack (see bug 5781) */
+ sa_valid |= MDS_OPEN_OWNEROVERRIDE;
+ return sa_valid;
+}
+
+static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec,
+ struct md_op_data *op_data)
+{
+ rec->sa_opcode = REINT_SETATTR;
+ rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ rec->sa_cap = cfs_curproc_cap_pack();
+ rec->sa_suppgid = -1;
+
+ rec->sa_fid = op_data->op_fid1;
+ rec->sa_valid = attr_pack(op_data->op_attr.ia_valid);
+ rec->sa_mode = op_data->op_attr.ia_mode;
+ rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid);
+ rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid);
+ rec->sa_size = op_data->op_attr.ia_size;
+ rec->sa_blocks = op_data->op_attr_blocks;
+ rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime);
+ rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime);
+ rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime);
+ rec->sa_attr_flags =
+ ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+ if ((op_data->op_attr.ia_valid & ATTR_GID) &&
+ in_group_p(op_data->op_attr.ia_gid))
+ rec->sa_suppgid =
+ from_kgid(&init_user_ns, op_data->op_attr.ia_gid);
+ else
+ rec->sa_suppgid = op_data->op_suppgids[0];
+
+ rec->sa_bias = op_data->op_bias;
+}
+
+static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch,
+ struct md_op_data *op_data)
+{
+ memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle));
+ epoch->ioepoch = op_data->op_ioepoch;
+ epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+}
+
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len)
+{
+ struct mdt_rec_setattr *rec;
+ struct mdt_ioepoch *epoch;
+ struct lov_user_md *lum = NULL;
+
+ CLASSERT(sizeof(struct mdt_rec_reint) ==
+ sizeof(struct mdt_rec_setattr));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+ mdc_setattr_pack_rec(rec, op_data);
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+ if (op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) {
+ epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+ mdc_ioepoch_pack(epoch, op_data);
+ }
+
+ if (ealen == 0)
+ return;
+
+ lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+ if (ea == NULL) { /* Remove LOV EA */
+ lum->lmm_magic = LOV_USER_MAGIC_V1;
+ lum->lmm_stripe_size = 0;
+ lum->lmm_stripe_count = 0;
+ lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1);
+ } else {
+ memcpy(lum, ea, ealen);
+ }
+
+ if (ea2len == 0)
+ return;
+
+ memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2,
+ ea2len);
+}
+
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+ struct mdt_rec_unlink *rec;
+ char *tmp;
+
+ CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+ LASSERT(rec != NULL);
+
+ rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ?
+ REINT_RMENTRY : REINT_UNLINK;
+ rec->ul_fsuid = op_data->op_fsuid;
+ rec->ul_fsgid = op_data->op_fsgid;
+ rec->ul_cap = op_data->op_cap;
+ rec->ul_mode = op_data->op_mode;
+ rec->ul_suppgid1 = op_data->op_suppgids[0];
+ rec->ul_suppgid2 = -1;
+ rec->ul_fid1 = op_data->op_fid1;
+ rec->ul_fid2 = op_data->op_fid2;
+ rec->ul_time = op_data->op_mod_time;
+ rec->ul_bias = op_data->op_bias;
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ LASSERT(tmp != NULL);
+ LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+ struct mdt_rec_link *rec;
+ char *tmp;
+
+ CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+ LASSERT(rec != NULL);
+
+ rec->lk_opcode = REINT_LINK;
+ rec->lk_fsuid = op_data->op_fsuid; /* current->fsuid; */
+ rec->lk_fsgid = op_data->op_fsgid; /* current->fsgid; */
+ rec->lk_cap = op_data->op_cap; /* current->cap_effective; */
+ rec->lk_suppgid1 = op_data->op_suppgids[0];
+ rec->lk_suppgid2 = op_data->op_suppgids[1];
+ rec->lk_fid1 = op_data->op_fid1;
+ rec->lk_fid2 = op_data->op_fid2;
+ rec->lk_time = op_data->op_mod_time;
+ rec->lk_bias = op_data->op_bias;
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new, int newlen)
+{
+ struct mdt_rec_rename *rec;
+ char *tmp;
+
+ CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+ /* XXX do something about time, uid, gid */
+ rec->rn_opcode = REINT_RENAME;
+ rec->rn_fsuid = op_data->op_fsuid;
+ rec->rn_fsgid = op_data->op_fsgid;
+ rec->rn_cap = op_data->op_cap;
+ rec->rn_suppgid1 = op_data->op_suppgids[0];
+ rec->rn_suppgid2 = op_data->op_suppgids[1];
+ rec->rn_fid1 = op_data->op_fid1;
+ rec->rn_fid2 = op_data->op_fid2;
+ rec->rn_time = op_data->op_mod_time;
+ rec->rn_mode = op_data->op_mode;
+ rec->rn_bias = op_data->op_bias;
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ LOGL0(old, oldlen, tmp);
+
+ if (new) {
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT);
+ LOGL0(new, newlen, tmp);
+ }
+}
+
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+ struct md_op_data *op_data, int ea_size)
+{
+ struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+ &RMF_MDT_BODY);
+
+ b->valid = valid;
+ if (op_data->op_bias & MDS_CHECK_SPLIT)
+ b->valid |= OBD_MD_FLCKSPLIT;
+ if (op_data->op_bias & MDS_CROSS_REF)
+ b->valid |= OBD_MD_FLCROSSREF;
+ b->eadatasize = ea_size;
+ b->flags = flags;
+ __mdc_pack_body(b, op_data->op_suppgids[0]);
+
+ b->fid1 = op_data->op_fid1;
+ b->fid2 = op_data->op_fid2;
+ b->valid |= OBD_MD_FLID;
+
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+ if (op_data->op_name) {
+ char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+
+ LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+ }
+}
+
+static void mdc_hsm_release_pack(struct ptlrpc_request *req,
+ struct md_op_data *op_data)
+{
+ if (op_data->op_bias & MDS_HSM_RELEASE) {
+ struct close_data *data;
+ struct ldlm_lock *lock;
+
+ data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
+ LASSERT(data != NULL);
+
+ lock = ldlm_handle2lock(&op_data->op_lease_handle);
+ if (lock != NULL) {
+ data->cd_handle = lock->l_remote_handle;
+ ldlm_lock_put(lock);
+ }
+ ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+
+ data->cd_data_version = op_data->op_data_version;
+ data->cd_fid = op_data->op_fid2;
+ }
+}
+
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+ struct mdt_ioepoch *epoch;
+ struct mdt_rec_setattr *rec;
+
+ epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+ mdc_setattr_pack_rec(rec, op_data);
+ mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_ioepoch_pack(epoch, op_data);
+ mdc_hsm_release_pack(req, op_data);
+}
+
+static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+ int rc;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ rc = list_empty(&mcw->mcw_entry);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ return rc;
+};
+
+/* We record requests in flight in cli->cl_r_in_flight here.
+ * There is only one write RPC possible in mdc anyway. If this changes
+ * in the future, the code may need to be revisited. */
+int mdc_enter_request(struct client_obd *cli)
+{
+ int rc = 0;
+ struct mdc_cache_waiter mcw;
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+ list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+ init_waitqueue_head(&mcw.mcw_waitq);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw),
+ &lwi);
+ if (rc) {
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ if (list_empty(&mcw.mcw_entry))
+ cli->cl_r_in_flight--;
+ list_del_init(&mcw.mcw_entry);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ }
+ } else {
+ cli->cl_r_in_flight++;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ }
+ return rc;
+}
+
+void mdc_exit_request(struct client_obd *cli)
+{
+ struct list_head *l, *tmp;
+ struct mdc_cache_waiter *mcw;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_r_in_flight--;
+ list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+ if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+ /* No free request slots anymore */
+ break;
+ }
+
+ mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+ list_del_init(&mcw->mcw_entry);
+ cli->cl_r_in_flight++;
+ wake_up(&mcw->mcw_waitq);
+ }
+ /* cl_r_in_flight was already decreased on entry; any waiters still
+ * queued will be woken as more slots free up. */
+
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
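+
+/*
+ * Illustrative sketch, not part of the original code: callers are expected
+ * to bracket a metadata RPC with the enter/exit pair above. The wrapper and
+ * the send callback are hypothetical; the real call sites live in the rest
+ * of the mdc code.
+ */
+static inline int mdc_example_with_rpc_slot(struct client_obd *cli,
+ int (*send_rpc)(struct client_obd *))
+{
+ int rc = mdc_enter_request(cli);
+
+ if (rc)
+ return rc; /* interrupted while waiting for a free slot */
+
+ rc = send_rpc(cli);
+ mdc_exit_request(cli);
+ return rc;
+}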
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
new file mode 100644
index 000000000..d1c224ecd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
@@ -0,0 +1,1313 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+
+#include "../include/lustre_intent.h"
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_fid.h" /* fid_res_name_eq() */
+#include "../include/lustre_mdc.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_req_layout.h"
+#include "mdc_internal.h"
+
+struct mdc_getattr_args {
+ struct obd_export *ga_exp;
+ struct md_enqueue_info *ga_minfo;
+ struct ldlm_enqueue_info *ga_einfo;
+};
+
+int it_disposition(struct lookup_intent *it, int flag)
+{
+ return it->d.lustre.it_disposition & flag;
+}
+EXPORT_SYMBOL(it_disposition);
+
+void it_set_disposition(struct lookup_intent *it, int flag)
+{
+ it->d.lustre.it_disposition |= flag;
+}
+EXPORT_SYMBOL(it_set_disposition);
+
+void it_clear_disposition(struct lookup_intent *it, int flag)
+{
+ it->d.lustre.it_disposition &= ~flag;
+}
+EXPORT_SYMBOL(it_clear_disposition);
+
+int it_open_error(int phase, struct lookup_intent *it)
+{
+ if (it_disposition(it, DISP_OPEN_LEASE)) {
+ if (phase >= DISP_OPEN_LEASE)
+ return it->d.lustre.it_status;
+ else
+ return 0;
+ }
+ if (it_disposition(it, DISP_OPEN_OPEN)) {
+ if (phase >= DISP_OPEN_OPEN)
+ return it->d.lustre.it_status;
+ else
+ return 0;
+ }
+
+ if (it_disposition(it, DISP_OPEN_CREATE)) {
+ if (phase >= DISP_OPEN_CREATE)
+ return it->d.lustre.it_status;
+ else
+ return 0;
+ }
+
+ if (it_disposition(it, DISP_LOOKUP_EXECD)) {
+ if (phase >= DISP_LOOKUP_EXECD)
+ return it->d.lustre.it_status;
+ else
+ return 0;
+ }
+
+ if (it_disposition(it, DISP_IT_EXECD)) {
+ if (phase >= DISP_IT_EXECD)
+ return it->d.lustre.it_status;
+ else
+ return 0;
+ }
+ CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
+ it->d.lustre.it_status);
+ LBUG();
+ return 0;
+}
+EXPORT_SYMBOL(it_open_error);
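+
+/*
+ * Illustrative sketch, not part of the original code: after an IT_OPEN
+ * intent has been executed, a caller would typically check the open phase
+ * before using the result. The wrapper name is hypothetical.
+ */
+static inline int mdc_example_check_intent_open(struct lookup_intent *it)
+{
+ int rc = it_open_error(DISP_OPEN_OPEN, it);
+
+ if (rc)
+ CDEBUG(D_INFO, "intent open failed: rc = %d\n", rc);
+ return rc;
+}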
+
+/* this must be called on a lockh that is known to have a referenced lock */
+int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+ __u64 *bits)
+{
+ struct ldlm_lock *lock;
+ struct inode *new_inode = data;
+
+ if (bits)
+ *bits = 0;
+
+ if (!*lockh)
+ return 0;
+
+ lock = ldlm_handle2lock((struct lustre_handle *)lockh);
+
+ LASSERT(lock != NULL);
+ lock_res_and_lock(lock);
+ if (lock->l_resource->lr_lvb_inode &&
+ lock->l_resource->lr_lvb_inode != data) {
+ struct inode *old_inode = lock->l_resource->lr_lvb_inode;
+
+ LASSERTF(old_inode->i_state & I_FREEING,
+ "Found existing inode %p/%lu/%u state %lu in lock: setting data to %p/%lu/%u\n",
+ old_inode, old_inode->i_ino, old_inode->i_generation,
+ old_inode->i_state, new_inode, new_inode->i_ino,
+ new_inode->i_generation);
+ }
+ lock->l_resource->lr_lvb_inode = new_inode;
+ if (bits)
+ *bits = lock->l_policy_data.l_inodebits.bits;
+
+ unlock_res_and_lock(lock);
+ LDLM_LOCK_PUT(lock);
+
+ return 0;
+}
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+ const struct lu_fid *fid, ldlm_type_t type,
+ ldlm_policy_data_t *policy, ldlm_mode_t mode,
+ struct lustre_handle *lockh)
+{
+ struct ldlm_res_id res_id;
+ ldlm_mode_t rc;
+
+ fid_build_reg_res_name(fid, &res_id);
+ /* LU-4405: Clear bits not supported by server */
+ policy->l_inodebits.bits &= exp_connect_ibits(exp);
+ rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
+ &res_id, type, policy, mode, lockh, 0);
+ return rc;
+}
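+
+/*
+ * Illustrative sketch, not part of the original code: matching an existing
+ * LOOKUP ibits lock on a fid. The wrapper is hypothetical; match flags are
+ * passed through so no particular policy is asserted here.
+ */
+static inline ldlm_mode_t mdc_example_match_lookup(struct obd_export *exp,
+ __u64 flags, const struct lu_fid *fid,
+ struct lustre_handle *lockh)
+{
+ ldlm_policy_data_t policy = {
+ .l_inodebits = { MDS_INODELOCK_LOOKUP }
+ };
+
+ return mdc_lock_match(exp, flags, fid, LDLM_IBITS, &policy,
+ LCK_CR | LCK_PR, lockh);
+}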
+
+int mdc_cancel_unused(struct obd_export *exp,
+ const struct lu_fid *fid,
+ ldlm_policy_data_t *policy,
+ ldlm_mode_t mode,
+ ldlm_cancel_flags_t flags,
+ void *opaque)
+{
+ struct ldlm_res_id res_id;
+ struct obd_device *obd = class_exp2obd(exp);
+ int rc;
+
+ fid_build_reg_res_name(fid, &res_id);
+ rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id,
+ policy, mode, flags, opaque);
+ return rc;
+}
+
+int mdc_null_inode(struct obd_export *exp,
+ const struct lu_fid *fid)
+{
+ struct ldlm_res_id res_id;
+ struct ldlm_resource *res;
+ struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+
+ LASSERTF(ns != NULL, "no namespace passed\n");
+
+ fid_build_reg_res_name(fid, &res_id);
+
+ res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+ if (res == NULL)
+ return 0;
+
+ lock_res(res);
+ res->lr_lvb_inode = NULL;
+ unlock_res(res);
+
+ ldlm_resource_putref(res);
+ return 0;
+}
+
+/* Find any ldlm lock of the inode in mdc.
+ * Returns: 0 if no lock is found
+ * 1 if a lock is found
+ * < 0 on error */
+int mdc_find_cbdata(struct obd_export *exp,
+ const struct lu_fid *fid,
+ ldlm_iterator_t it, void *data)
+{
+ struct ldlm_res_id res_id;
+ int rc = 0;
+
+ fid_build_reg_res_name((struct lu_fid *)fid, &res_id);
+ rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
+ it, data);
+ if (rc == LDLM_ITER_STOP)
+ return 1;
+ else if (rc == LDLM_ITER_CONTINUE)
+ return 0;
+ return rc;
+}
+
+static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
+{
+ /* Don't hold error requests for replay. */
+ if (req->rq_replay) {
+ spin_lock(&req->rq_lock);
+ req->rq_replay = 0;
+ spin_unlock(&req->rq_lock);
+ }
+ if (rc && req->rq_transno != 0) {
+ DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
+ LBUG();
+ }
+}
+
+/* Save a large LOV EA into the request buffer so that it is available
+ * for replay. We don't do this in the initial request because the
+ * original request doesn't need this buffer (at most it sends just the
+ * lov_mds_md): sending the empty buffer is a waste of RAM/bandwidth,
+ * and it may also be difficult to allocate and save a very large
+ * request buffer for each open (bug 5707).
+ *
+ * OOM here may cause recovery failure if the lmm is needed (only for
+ * the original open, if the MDS crashed just when this client also
+ * OOM'd), but this is incredibly unlikely, and it is questionable
+ * whether the client could do MDS recovery under OOM anyway... */
+static void mdc_realloc_openmsg(struct ptlrpc_request *req,
+ struct mdt_body *body)
+{
+ int rc;
+
+ /* FIXME: remove this explicit offset. */
+ rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4,
+ body->eadatasize);
+ if (rc) {
+ CERROR("Can't enlarge segment %d size to %d\n",
+ DLM_INTENT_REC_OFF + 4, body->eadatasize);
+ body->valid &= ~OBD_MD_FLEASIZE;
+ body->eadatasize = 0;
+ }
+}
+
+static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct md_op_data *op_data,
+ void *lmm, int lmmsize,
+ void *cb_data)
+{
+ struct ptlrpc_request *req;
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct ldlm_intent *lit;
+ LIST_HEAD(cancels);
+ int count = 0;
+ int mode;
+ int rc;
+
+ it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
+
+ /* XXX: openlock is not cancelled for cross-refs. */
+ /* If inode is known, cancel conflicting OPEN locks. */
+ if (fid_is_sane(&op_data->op_fid2)) {
+ if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */
+ if (it->it_flags & FMODE_WRITE)
+ mode = LCK_EX;
+ else
+ mode = LCK_PR;
+ } else {
+ if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
+ mode = LCK_CW;
+ else if (it->it_flags & __FMODE_EXEC)
+ mode = LCK_PR;
+ else
+ mode = LCK_CR;
+ }
+ count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+ &cancels, mode,
+ MDS_INODELOCK_OPEN);
+ }
+
+ /* If CREATE, cancel parent's UPDATE lock. */
+ if (it->it_op & IT_CREAT)
+ mode = LCK_EX;
+ else
+ mode = LCK_CR;
+ count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, mode,
+ MDS_INODELOCK_UPDATE);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_LDLM_INTENT_OPEN);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* parent capability */
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ /* Child capability: reserve the size according to the parent capa;
+ * it will be filled in after we get the reply. */
+ mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+ max(lmmsize, obddev->u.cli.cl_default_mds_easize));
+
+ rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+ if (rc < 0) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ spin_lock(&req->rq_lock);
+ req->rq_replay = req->rq_import->imp_replayable;
+ spin_unlock(&req->rq_lock);
+
+ /* pack the intent */
+ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+ lit->opc = (__u64)it->it_op;
+
+ /* pack the intended request */
+ mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm,
+ lmmsize);
+
+ /* for remote client, fetch remote perm for current user */
+ if (client_is_remote(exp))
+ req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+ sizeof(struct mdt_remote_perm));
+ ptlrpc_request_set_replen(req);
+ return req;
+}
+
+static struct ptlrpc_request *
+mdc_intent_getxattr_pack(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct md_op_data *op_data)
+{
+ struct ptlrpc_request *req;
+ struct ldlm_intent *lit;
+ int rc, count = 0, maxdata;
+ LIST_HEAD(cancels);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_LDLM_INTENT_GETXATTR);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ /* pack the intent */
+ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+ lit->opc = IT_GETXATTR;
+
+ maxdata = class_exp2cliimp(exp)->imp_connect_data.ocd_max_easize;
+
+ /* pack the intended request */
+ mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+ op_data->op_valid, maxdata, -1, 0);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+ RCL_SERVER, maxdata);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_EAVALS,
+ RCL_SERVER, maxdata);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS,
+ RCL_SERVER, maxdata);
+
+ ptlrpc_request_set_replen(req);
+
+ return req;
+}
+
+static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct md_op_data *op_data)
+{
+ struct ptlrpc_request *req;
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct ldlm_intent *lit;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_LDLM_INTENT_UNLINK);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+
+ rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ /* pack the intent */
+ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+ lit->opc = (__u64)it->it_op;
+
+ /* pack the intended request */
+ mdc_unlink_pack(req, op_data);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ obddev->u.cli.cl_default_mds_easize);
+ req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+ obddev->u.cli.cl_default_mds_cookiesize);
+ ptlrpc_request_set_replen(req);
+ return req;
+}
+
+static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct md_op_data *op_data)
+{
+ struct ptlrpc_request *req;
+ struct obd_device *obddev = class_exp2obd(exp);
+ u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
+ OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA |
+ OBD_MD_FLMDSCAPA | OBD_MD_MEA |
+ (client_is_remote(exp) ?
+ OBD_MD_FLRMTPERM : OBD_MD_FLACL);
+ struct ldlm_intent *lit;
+ int rc;
+ int easize;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_LDLM_INTENT_GETATTR);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+
+ rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ /* pack the intent */
+ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+ lit->opc = (__u64)it->it_op;
+
+ if (obddev->u.cli.cl_default_mds_easize > 0)
+ easize = obddev->u.cli.cl_default_mds_easize;
+ else
+ easize = obddev->u.cli.cl_max_mds_easize;
+
+ /* pack the intended request */
+ mdc_getattr_pack(req, valid, it->it_flags, op_data, easize);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize);
+ if (client_is_remote(exp))
+ req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+ sizeof(struct mdt_remote_perm));
+ ptlrpc_request_set_replen(req);
+ return req;
+}
+
+static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp,
+ struct lookup_intent *it,
+ struct md_op_data *unused)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ struct ldlm_intent *lit;
+ struct layout_intent *layout;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_LDLM_INTENT_LAYOUT);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0);
+ rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ /* pack the intent */
+ lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+ lit->opc = (__u64)it->it_op;
+
+ /* pack the layout intent request */
+ layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT);
+ /* LAYOUT_INTENT_ACCESS is generic, specific operation will be
+ * set for replication */
+ layout->li_opc = LAYOUT_INTENT_ACCESS;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+ obd->u.cli.cl_default_mds_easize);
+ ptlrpc_request_set_replen(req);
+ return req;
+}
+
+static struct ptlrpc_request *
+mdc_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return ERR_PTR(rc);
+ }
+
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+ ptlrpc_request_set_replen(req);
+ return req;
+}
+
+static int mdc_finish_enqueue(struct obd_export *exp,
+ struct ptlrpc_request *req,
+ struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it,
+ struct lustre_handle *lockh,
+ int rc)
+{
+ struct req_capsule *pill = &req->rq_pill;
+ struct ldlm_request *lockreq;
+ struct ldlm_reply *lockrep;
+ struct lustre_intent_data *intent = &it->d.lustre;
+ struct ldlm_lock *lock;
+ void *lvb_data = NULL;
+ int lvb_len = 0;
+
+ LASSERT(rc >= 0);
+ /* Similarly, if we're going to replay this request, we don't want to
+ * actually get a lock, just perform the intent. */
+ if (req->rq_transno || req->rq_replay) {
+ lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ);
+ lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY);
+ }
+
+ if (rc == ELDLM_LOCK_ABORTED) {
+ einfo->ei_mode = 0;
+ memset(lockh, 0, sizeof(*lockh));
+ rc = 0;
+ } else { /* rc = 0 */
+ lock = ldlm_handle2lock(lockh);
+ LASSERT(lock != NULL);
+
+ /* If the server gave us back a different lock mode, we should
+ * fix up our variables. */
+ if (lock->l_req_mode != einfo->ei_mode) {
+ ldlm_lock_addref(lockh, lock->l_req_mode);
+ ldlm_lock_decref(lockh, einfo->ei_mode);
+ einfo->ei_mode = lock->l_req_mode;
+ }
+ LDLM_LOCK_PUT(lock);
+ }
+
+ lockrep = req_capsule_server_get(pill, &RMF_DLM_REP);
+ LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */
+
+ intent->it_disposition = (int)lockrep->lock_policy_res1;
+ intent->it_status = (int)lockrep->lock_policy_res2;
+ intent->it_lock_mode = einfo->ei_mode;
+ intent->it_lock_handle = lockh->cookie;
+ intent->it_data = req;
+
+ /* Technically speaking rq_transno must already be zero if
+ * it_status is in error, so the check is a bit redundant */
+ if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay)
+ mdc_clear_replay_flag(req, intent->it_status);
+
+ /* If we're doing an IT_OPEN which did not result in an actual
+ * successful open, then we need to remove the bit which saves
+ * this request for unconditional replay.
+ *
+ * It's important that we do this first! Otherwise we might exit the
+ * function without doing so, and try to replay a failed create
+ * (bug 3440) */
+ if (it->it_op & IT_OPEN && req->rq_replay &&
+ (!it_disposition(it, DISP_OPEN_OPEN) || intent->it_status != 0))
+ mdc_clear_replay_flag(req, intent->it_status);
+
+ DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d",
+ it->it_op, intent->it_disposition, intent->it_status);
+
+ /* We know what to expect, so we do any byte flipping required here */
+ if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) {
+ struct mdt_body *body;
+
+ body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ if (body == NULL) {
+ CERROR("Can't swab mdt_body\n");
+ return -EPROTO;
+ }
+
+ if (it_disposition(it, DISP_OPEN_OPEN) &&
+ !it_open_error(DISP_OPEN_OPEN, it)) {
+ /*
+ * If this is a successful OPEN request, we need to set
+ * replay handler and data early, so that if replay
+ * happens immediately after swabbing below, new reply
+ * is swabbed by that handler correctly.
+ */
+ mdc_set_open_replay_data(NULL, NULL, it);
+ }
+
+ if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
+ void *eadata;
+
+ mdc_update_max_ea_from_body(exp, body);
+
+ /*
+ * The eadata is opaque; just check that it is there.
+ * Eventually, obd_unpackmd() will check the contents.
+ */
+ eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+ body->eadatasize);
+ if (eadata == NULL)
+ return -EPROTO;
+
+ /* save lvb data and length in case this is for layout
+ * lock */
+ lvb_data = eadata;
+ lvb_len = body->eadatasize;
+
+ /*
+ * We save the reply LOV EA in case we have to replay a
+ * create for recovery. If we didn't allocate a large
+ * enough request buffer above we need to reallocate it
+ * here to hold the actual LOV EA.
+ *
+ * Do not save the LOV EA if the request is not going to be
+ * replayed (for example, an error request).
+ */
+ if ((it->it_op & IT_OPEN) && req->rq_replay) {
+ void *lmm;
+
+ if (req_capsule_get_size(pill, &RMF_EADATA,
+ RCL_CLIENT) <
+ body->eadatasize)
+ mdc_realloc_openmsg(req, body);
+ else
+ req_capsule_shrink(pill, &RMF_EADATA,
+ body->eadatasize,
+ RCL_CLIENT);
+
+ req_capsule_set_size(pill, &RMF_EADATA,
+ RCL_CLIENT,
+ body->eadatasize);
+
+ lmm = req_capsule_client_get(pill, &RMF_EADATA);
+ if (lmm)
+ memcpy(lmm, eadata, body->eadatasize);
+ }
+ }
+
+ if (body->valid & OBD_MD_FLRMTPERM) {
+ struct mdt_remote_perm *perm;
+
+ LASSERT(client_is_remote(exp));
+ perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+ lustre_swab_mdt_remote_perm);
+ if (perm == NULL)
+ return -EPROTO;
+ }
+ if (body->valid & OBD_MD_FLMDSCAPA) {
+ struct lustre_capa *capa, *p;
+
+ capa = req_capsule_server_get(pill, &RMF_CAPA1);
+ if (capa == NULL)
+ return -EPROTO;
+
+ if (it->it_op & IT_OPEN) {
+ /* client fid capa will be checked in replay */
+ p = req_capsule_client_get(pill, &RMF_CAPA2);
+ LASSERT(p);
+ *p = *capa;
+ }
+ }
+ if (body->valid & OBD_MD_FLOSSCAPA) {
+ struct lustre_capa *capa;
+
+ capa = req_capsule_server_get(pill, &RMF_CAPA2);
+ if (capa == NULL)
+ return -EPROTO;
+ }
+ } else if (it->it_op & IT_LAYOUT) {
+ /* maybe the lock was granted right away and layout
+ * is packed into RMF_DLM_LVB of req */
+ lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER);
+ if (lvb_len > 0) {
+ lvb_data = req_capsule_server_sized_get(pill,
+ &RMF_DLM_LVB, lvb_len);
+ if (lvb_data == NULL)
+ return -EPROTO;
+ }
+ }
+
+ /* fill in stripe data for layout lock */
+ lock = ldlm_handle2lock(lockh);
+ if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) {
+ void *lmm;
+
+ LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n",
+ ldlm_it2str(it->it_op), lvb_len);
+
+ OBD_ALLOC_LARGE(lmm, lvb_len);
+ if (lmm == NULL) {
+ LDLM_LOCK_PUT(lock);
+ return -ENOMEM;
+ }
+ memcpy(lmm, lvb_data, lvb_len);
+
+ /* install lvb_data */
+ lock_res_and_lock(lock);
+ if (lock->l_lvb_data == NULL) {
+ lock->l_lvb_type = LVB_T_LAYOUT;
+ lock->l_lvb_data = lmm;
+ lock->l_lvb_len = lvb_len;
+ lmm = NULL;
+ }
+ unlock_res_and_lock(lock);
+ if (lmm != NULL)
+ OBD_FREE_LARGE(lmm, lvb_len);
+ }
+ if (lock != NULL)
+ LDLM_LOCK_PUT(lock);
+
+ return rc;
+}
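+
+/*
+ * A minimal consumer-side sketch of the layout LVB installed above (the
+ * buffer name "lbuf" and its length "lbuf_len" are assumptions, not names
+ * from this file): a layout user is expected to copy l_lvb_data out under
+ * the resource lock, roughly as follows:
+ *
+ *	lock_res_and_lock(lock);
+ *	if (lock->l_lvb_data != NULL && lock->l_lvb_len <= lbuf_len)
+ *		memcpy(lbuf, lock->l_lvb_data, lock->l_lvb_len);
+ *	unlock_res_and_lock(lock);
+ */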
+
+/* We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know in advance the file type. */
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+ struct lookup_intent *it, struct md_op_data *op_data,
+ struct lustre_handle *lockh, void *lmm, int lmmsize,
+ struct ptlrpc_request **reqp, u64 extra_lock_flags)
+{
+ static const ldlm_policy_data_t lookup_policy = {
+ .l_inodebits = { MDS_INODELOCK_LOOKUP }
+ };
+ static const ldlm_policy_data_t update_policy = {
+ .l_inodebits = { MDS_INODELOCK_UPDATE }
+ };
+ static const ldlm_policy_data_t layout_policy = {
+ .l_inodebits = { MDS_INODELOCK_LAYOUT }
+ };
+ static const ldlm_policy_data_t getxattr_policy = {
+ .l_inodebits = { MDS_INODELOCK_XATTR }
+ };
+ ldlm_policy_data_t const *policy = &lookup_policy;
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ u64 flags, saved_flags = extra_lock_flags;
+ struct ldlm_res_id res_id;
+ int generation, resends = 0;
+ struct ldlm_reply *lockrep;
+ enum lvb_type lvb_type = LVB_T_NONE;
+ int rc;
+
+ LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n",
+ einfo->ei_type);
+
+ fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+
+ if (it) {
+ saved_flags |= LDLM_FL_HAS_INTENT;
+ if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
+ policy = &update_policy;
+ else if (it->it_op & IT_LAYOUT)
+ policy = &layout_policy;
+ else if (it->it_op & (IT_GETXATTR | IT_SETXATTR))
+ policy = &getxattr_policy;
+ }
+
+ LASSERT(reqp == NULL);
+
+ generation = obddev->u.cli.cl_import->imp_generation;
+resend:
+ flags = saved_flags;
+ if (!it) {
+		/* Right now the only intent-less case is FLOCK; the caller
+		 * hides the flock policy in the lmm argument, with lmmsize
+		 * set to 0. */
+ LASSERT(lmm && lmmsize == 0);
+ LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n",
+ einfo->ei_type);
+ policy = (ldlm_policy_data_t *)lmm;
+ res_id.name[3] = LDLM_FLOCK;
+ req = NULL;
+ } else if (it->it_op & IT_OPEN) {
+ req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize,
+ einfo->ei_cbdata);
+ policy = &update_policy;
+ einfo->ei_cbdata = NULL;
+ lmm = NULL;
+ } else if (it->it_op & IT_UNLINK) {
+ req = mdc_intent_unlink_pack(exp, it, op_data);
+ } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
+ req = mdc_intent_getattr_pack(exp, it, op_data);
+ } else if (it->it_op & IT_READDIR) {
+ req = mdc_enqueue_pack(exp, 0);
+ } else if (it->it_op & IT_LAYOUT) {
+ if (!imp_connect_lvb_type(class_exp2cliimp(exp)))
+ return -EOPNOTSUPP;
+ req = mdc_intent_layout_pack(exp, it, op_data);
+ lvb_type = LVB_T_LAYOUT;
+ } else if (it->it_op & IT_GETXATTR) {
+ req = mdc_intent_getxattr_pack(exp, it, op_data);
+ } else {
+ LBUG();
+ return -EINVAL;
+ }
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ if (req != NULL && it && it->it_op & IT_CREAT)
+ /* ask ptlrpc not to resend on EINPROGRESS since we have our own
+ * retry logic */
+ req->rq_no_retry_einprogress = 1;
+
+ if (resends) {
+ req->rq_generation_set = 1;
+ req->rq_import_generation = generation;
+ req->rq_sent = get_seconds() + resends;
+ }
+
+	/* It is important to obtain rpc_lock first (if applicable), so that
+	 * threads that are serialised with rpc_lock do not pollute our
+	 * RPCs-in-flight counter. We do not limit flock requests, though. */
+ if (it) {
+ mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+ rc = mdc_enter_request(&obddev->u.cli);
+ if (rc != 0) {
+ mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+ mdc_clear_replay_flag(req, 0);
+ ptlrpc_req_finished(req);
+ return rc;
+ }
+ }
+
+ rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+ 0, lvb_type, lockh, 0);
+ if (!it) {
+		/* For flock requests we return immediately and let the caller
+		 * deal with the rest, since the metadata processing in the
+		 * remainder of this function makes no sense for flock requests
+		 * anyway. However, on a communication problem with the server
+		 * (ETIMEDOUT) or any signal/kill attempt (EINTR), we cannot
+		 * rely on the caller; this mainly matters for F_UNLCK requests
+		 * (explicit, or generated by the kernel to clean up the current
+		 * process's flocks on exit), which must not be dropped. */
+ if ((rc == -EINTR) || (rc == -ETIMEDOUT))
+ goto resend;
+ return rc;
+ }
+
+ mdc_exit_request(&obddev->u.cli);
+ mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+
+ if (rc < 0) {
+ CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
+ "%s: ldlm_cli_enqueue failed: rc = %d\n",
+ obddev->obd_name, rc);
+
+ mdc_clear_replay_flag(req, rc);
+ ptlrpc_req_finished(req);
+ return rc;
+ }
+
+ lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ LASSERT(lockrep != NULL);
+
+ lockrep->lock_policy_res2 =
+ ptlrpc_status_ntoh(lockrep->lock_policy_res2);
+
+ /* Retry the create infinitely when we get -EINPROGRESS from
+ * server. This is required by the new quota design. */
+ if (it && it->it_op & IT_CREAT &&
+ (int)lockrep->lock_policy_res2 == -EINPROGRESS) {
+ mdc_clear_replay_flag(req, rc);
+ ptlrpc_req_finished(req);
+ resends++;
+
+ CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n",
+ obddev->obd_name, resends, it->it_op,
+ PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+ if (generation == obddev->u.cli.cl_import->imp_generation) {
+ goto resend;
+ } else {
+ CDEBUG(D_HA, "resend cross eviction\n");
+ return -EIO;
+ }
+ }
+
+ rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+ if (rc < 0) {
+ if (lustre_handle_is_used(lockh)) {
+ ldlm_lock_decref(lockh, einfo->ei_mode);
+ memset(lockh, 0, sizeof(*lockh));
+ }
+ ptlrpc_req_finished(req);
+
+ it->d.lustre.it_lock_handle = 0;
+ it->d.lustre.it_lock_mode = 0;
+ it->d.lustre.it_data = NULL;
+ }
+
+ return rc;
+}
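+
+/*
+ * Sketch of the intent-less (flock) calling convention handled above; the
+ * local variable names are assumptions and the flock fields are left to the
+ * caller:
+ *
+ *	ldlm_policy_data_t flock_policy;
+ *
+ *	memset(&flock_policy, 0, sizeof(flock_policy));
+ *	(fill flock_policy.l_flock with the range and owner of the lock)
+ *	einfo->ei_type = LDLM_FLOCK;
+ *	rc = mdc_enqueue(exp, einfo, NULL, op_data, lockh,
+ *			 &flock_policy, 0, NULL, 0);
+ */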
+
+static int mdc_finish_intent_lock(struct obd_export *exp,
+ struct ptlrpc_request *request,
+ struct md_op_data *op_data,
+ struct lookup_intent *it,
+ struct lustre_handle *lockh)
+{
+ struct lustre_handle old_lock;
+ struct mdt_body *mdt_body;
+ struct ldlm_lock *lock;
+ int rc;
+
+ LASSERT(request != NULL);
+ LASSERT(request != LP_POISON);
+ LASSERT(request->rq_repmsg != LP_POISON);
+
+ if (!it_disposition(it, DISP_IT_EXECD)) {
+ /* The server failed before it even started executing the
+ * intent, i.e. because it couldn't unpack the request. */
+ LASSERT(it->d.lustre.it_status != 0);
+ return it->d.lustre.it_status;
+ }
+ rc = it_open_error(DISP_IT_EXECD, it);
+ if (rc)
+ return rc;
+
+ mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+ LASSERT(mdt_body != NULL); /* mdc_enqueue checked */
+
+ /* If we were revalidating a fid/name pair, mark the intent in
+ * case we fail and get called again from lookup */
+ if (fid_is_sane(&op_data->op_fid2) &&
+ it->it_create_mode & M_CHECK_STALE &&
+ it->it_op != IT_GETATTR) {
+
+		/* Also: did we find the same inode? */
+		/* The server can return one of two fids:
+		 * op_fid2 - a newly allocated fid, if the file was created;
+		 * op_fid3 - an existing fid, if the file was only opened.
+		 * op_fid3 is saved in lmv_intent_open. */
+ if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) &&
+ (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) {
+ CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID
+ "\n", PFID(&op_data->op_fid2),
+ PFID(&op_data->op_fid2), PFID(&mdt_body->fid1));
+ return -ESTALE;
+ }
+ }
+
+ rc = it_open_error(DISP_LOOKUP_EXECD, it);
+ if (rc)
+ return rc;
+
+	/* Keep the request around for the multiple phases of the call;
+	 * the DISP_XX dispositions guarantee that we made it into the call.
+	 */
+ if (!it_disposition(it, DISP_ENQ_CREATE_REF) &&
+ it_disposition(it, DISP_OPEN_CREATE) &&
+ !it_open_error(DISP_OPEN_CREATE, it)) {
+ it_set_disposition(it, DISP_ENQ_CREATE_REF);
+ ptlrpc_request_addref(request); /* balanced in ll_create_node */
+ }
+ if (!it_disposition(it, DISP_ENQ_OPEN_REF) &&
+ it_disposition(it, DISP_OPEN_OPEN) &&
+ !it_open_error(DISP_OPEN_OPEN, it)) {
+ it_set_disposition(it, DISP_ENQ_OPEN_REF);
+ ptlrpc_request_addref(request); /* balanced in ll_file_open */
+ /* BUG 11546 - eviction in the middle of open rpc processing */
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout);
+ }
+
+ if (it->it_op & IT_CREAT) {
+ /* XXX this belongs in ll_create_it */
+ } else if (it->it_op == IT_OPEN) {
+ LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+ } else {
+ LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT));
+ }
+
+ /* If we already have a matching lock, then cancel the new
+ * one. We have to set the data here instead of in
+ * mdc_enqueue, because we need to use the child's inode as
+ * the l_ast_data to match, and that's not available until
+	 * intent_finish has performed the iget(). */
+ lock = ldlm_handle2lock(lockh);
+ if (lock) {
+ ldlm_policy_data_t policy = lock->l_policy_data;
+
+ LDLM_DEBUG(lock, "matching against this");
+
+ LASSERTF(fid_res_name_eq(&mdt_body->fid1,
+ &lock->l_resource->lr_name),
+ "Lock res_id: "DLDLMRES", fid: "DFID"\n",
+ PLDLMRES(lock->l_resource), PFID(&mdt_body->fid1));
+ LDLM_LOCK_PUT(lock);
+
+ memcpy(&old_lock, lockh, sizeof(*lockh));
+ if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+ LDLM_IBITS, &policy, LCK_NL,
+ &old_lock, 0)) {
+ ldlm_lock_decref_and_cancel(lockh,
+ it->d.lustre.it_lock_mode);
+ memcpy(lockh, &old_lock, sizeof(old_lock));
+ it->d.lustre.it_lock_handle = lockh->cookie;
+ }
+ }
+ CDEBUG(D_DENTRY,
+ "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+ op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op),
+ it->d.lustre.it_status, it->d.lustre.it_disposition, rc);
+ return rc;
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+ struct lu_fid *fid, __u64 *bits)
+{
+ /* We could just return 1 immediately, but since we should only
+ * be called in revalidate_it if we already have a lock, let's
+ * verify that. */
+ struct ldlm_res_id res_id;
+ struct lustre_handle lockh;
+ ldlm_policy_data_t policy;
+ ldlm_mode_t mode;
+
+ if (it->d.lustre.it_lock_handle) {
+ lockh.cookie = it->d.lustre.it_lock_handle;
+ mode = ldlm_revalidate_lock_handle(&lockh, bits);
+ } else {
+ fid_build_reg_res_name(fid, &res_id);
+ switch (it->it_op) {
+ case IT_GETATTR:
+ /* File attributes are held under multiple bits:
+ * nlink is under lookup lock, size and times are
+ * under UPDATE lock and recently we've also got
+ * a separate permissions lock for owner/group/acl that
+ * were protected by lookup lock before.
+ * Getattr must provide all of that information,
+ * so we need to ensure we have all of those locks.
+ * Unfortunately, if the bits are split across multiple
+ * locks, there's no easy way to match all of them here,
+ * so an extra RPC would be performed to fetch all
+ * of those bits at once for now. */
+			/* For new MDTs (> 2.4), UPDATE|PERM should be enough,
+			 * but for old MDTs (< 2.4) permission is covered by
+			 * the LOOKUP lock, so we need to match all bits here. */
+ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE |
+ MDS_INODELOCK_LOOKUP |
+ MDS_INODELOCK_PERM;
+ break;
+ case IT_LAYOUT:
+ policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT;
+ break;
+ default:
+ policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP;
+ break;
+ }
+
+ mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid,
+ LDLM_IBITS, &policy,
+ LCK_CR | LCK_CW | LCK_PR | LCK_PW,
+ &lockh);
+ }
+
+ if (mode) {
+ it->d.lustre.it_lock_handle = lockh.cookie;
+ it->d.lustre.it_lock_mode = mode;
+ } else {
+ it->d.lustre.it_lock_handle = 0;
+ it->d.lustre.it_lock_mode = 0;
+ }
+
+ return !!mode;
+}
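+
+/*
+ * Caller-side sketch for mdc_revalidate_lock() (variable names assumed): a
+ * revalidating caller clears any stale handle in the intent and then checks
+ * which inode bits are still covered by a cached lock:
+ *
+ *	__u64 bits = 0;
+ *
+ *	it->d.lustre.it_lock_handle = 0;
+ *	if (mdc_revalidate_lock(exp, it, fid, &bits) &&
+ *	    (bits & MDS_INODELOCK_LOOKUP))
+ *		revalidated = 1;
+ *
+ * A non-zero return with the wanted bit set means the cached name is still
+ * covered by a granted lock and no new RPC is needed.
+ */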
+
+/*
+ * This long block is all about fixing up the lock and request state
+ * so that it is correct as of the moment _before_ the operation was
+ * applied; that way, the VFS will think that everything is normal and
+ * call Lustre's regular VFS methods.
+ *
+ * If we're performing a creation, that means that unless the creation
+ * failed with EEXIST, we should fake up a negative dentry.
+ *
+ * For everything else, we want the lookup to succeed.
+ *
+ * One additional note: if CREATE or OPEN succeeded, we add an extra
+ * reference to the request because we need to keep it around until
+ * ll_create/ll_open gets called.
+ *
+ * The server will return to us, in it_disposition, an indication of
+ * exactly what d.lustre.it_status refers to.
+ *
+ * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call;
+ * otherwise, if DISP_OPEN_CREATE is set, then it_status is the
+ * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
+ * DISP_LOOKUP_POS will be set, indicating whether the child lookup
+ * was successful.
+ *
+ * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * child lookup.
+ */
+int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+ void *lmm, int lmmsize, struct lookup_intent *it,
+ int lookup_flags, struct ptlrpc_request **reqp,
+ ldlm_blocking_callback cb_blocking,
+ __u64 extra_lock_flags)
+{
+ struct ldlm_enqueue_info einfo = {
+ .ei_type = LDLM_IBITS,
+ .ei_mode = it_to_lock_mode(it),
+ .ei_cb_bl = cb_blocking,
+ .ei_cb_cp = ldlm_completion_ast,
+ };
+ struct lustre_handle lockh;
+ int rc = 0;
+
+ LASSERT(it);
+
+ CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID
+ ", intent: %s flags %#Lo\n", op_data->op_namelen,
+ op_data->op_name, PFID(&op_data->op_fid2),
+ PFID(&op_data->op_fid1), ldlm_it2str(it->it_op),
+ it->it_flags);
+
+ lockh.cookie = 0;
+ if (fid_is_sane(&op_data->op_fid2) &&
+ (it->it_op & (IT_LOOKUP | IT_GETATTR))) {
+ /* We could just return 1 immediately, but since we should only
+ * be called in revalidate_it if we already have a lock, let's
+ * verify that. */
+ it->d.lustre.it_lock_handle = 0;
+ rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL);
+ /* Only return failure if it was not GETATTR by cfid
+ (from inode_revalidate) */
+ if (rc || op_data->op_namelen != 0)
+ return rc;
+ }
+
+	/* If the upper layer did not allocate a fid, do it now. */
+ if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) {
+ rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+ if (rc < 0) {
+ CERROR("Can't alloc new fid, rc %d\n", rc);
+ return rc;
+ }
+ }
+ rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh, lmm, lmmsize, NULL,
+ extra_lock_flags);
+ if (rc < 0)
+ return rc;
+
+ *reqp = it->d.lustre.it_data;
+ rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh);
+ return rc;
+}
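+
+/*
+ * Caller-side sketch for mdc_intent_lock() (local names assumed; cb_blocking
+ * stands for the caller's own blocking AST):
+ *
+ *	struct lookup_intent it = { .it_op = IT_LOOKUP };
+ *	struct ptlrpc_request *req = NULL;
+ *	int rc;
+ *
+ *	rc = mdc_intent_lock(exp, op_data, NULL, 0, &it, 0, &req,
+ *			     cb_blocking, 0);
+ *
+ * On success *reqp (here &req) carries the reply, and it.d.lustre.it_status
+ * together with the DISP_* dispositions describes what the server did.
+ */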
+
+static int mdc_intent_getattr_async_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *args, int rc)
+{
+ struct mdc_getattr_args *ga = args;
+ struct obd_export *exp = ga->ga_exp;
+ struct md_enqueue_info *minfo = ga->ga_minfo;
+ struct ldlm_enqueue_info *einfo = ga->ga_einfo;
+ struct lookup_intent *it;
+ struct lustre_handle *lockh;
+ struct obd_device *obddev;
+ struct ldlm_reply *lockrep;
+ __u64 flags = LDLM_FL_HAS_INTENT;
+
+ it = &minfo->mi_it;
+ lockh = &minfo->mi_lockh;
+
+ obddev = class_exp2obd(exp);
+
+ mdc_exit_request(&obddev->u.cli);
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
+ rc = -ETIMEDOUT;
+
+ rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
+ &flags, NULL, 0, lockh, rc);
+ if (rc < 0) {
+ CERROR("ldlm_cli_enqueue_fini: %d\n", rc);
+ mdc_clear_replay_flag(req, rc);
+ goto out;
+ }
+
+ lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+ LASSERT(lockrep != NULL);
+
+ lockrep->lock_policy_res2 =
+ ptlrpc_status_ntoh(lockrep->lock_policy_res2);
+
+ rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+ if (rc)
+ goto out;
+
+ rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh);
+
+out:
+ OBD_FREE_PTR(einfo);
+ minfo->mi_cb(req, minfo, rc);
+ return 0;
+}
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+ struct md_enqueue_info *minfo,
+ struct ldlm_enqueue_info *einfo)
+{
+ struct md_op_data *op_data = &minfo->mi_data;
+ struct lookup_intent *it = &minfo->mi_it;
+ struct ptlrpc_request *req;
+ struct mdc_getattr_args *ga;
+ struct obd_device *obddev = class_exp2obd(exp);
+ struct ldlm_res_id res_id;
+	/* XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed
+	 * for statahead currently. With CMD in the future these two bits may
+	 * be managed by different MDSes, and this should be adjusted then. */
+ ldlm_policy_data_t policy = {
+ .l_inodebits = { MDS_INODELOCK_LOOKUP |
+ MDS_INODELOCK_UPDATE }
+ };
+ int rc = 0;
+ __u64 flags = LDLM_FL_HAS_INTENT;
+
+ CDEBUG(D_DLMTRACE,
+ "name: %.*s in inode "DFID", intent: %s flags %#Lo\n",
+ op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+ ldlm_it2str(it->it_op), it->it_flags);
+
+ fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+ req = mdc_intent_getattr_pack(exp, it, op_data);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ rc = mdc_enter_request(&obddev->u.cli);
+ if (rc != 0) {
+ ptlrpc_req_finished(req);
+ return rc;
+ }
+
+ rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL,
+ 0, LVB_T_NONE, &minfo->mi_lockh, 1);
+ if (rc < 0) {
+ mdc_exit_request(&obddev->u.cli);
+ ptlrpc_req_finished(req);
+ return rc;
+ }
+
+ CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args));
+ ga = ptlrpc_req_async_args(req);
+ ga->ga_exp = exp;
+ ga->ga_minfo = minfo;
+ ga->ga_einfo = einfo;
+
+ req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
+ ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+ return 0;
+}
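+
+/*
+ * Usage sketch for mdc_intent_getattr_async() (statahead-style caller; the
+ * callback name and allocation details are assumptions): the caller fills a
+ * struct md_enqueue_info with the intent, the target in mi_data and a
+ * completion callback, and pairs it with a separately allocated
+ * ldlm_enqueue_info that is freed in the interpret callback above:
+ *
+ *	minfo->mi_it.it_op = IT_GETATTR;
+ *	minfo->mi_cb = my_getattr_done_cb;
+ *	einfo->ei_type = LDLM_IBITS;
+ *	einfo->ei_mode = it_to_lock_mode(&minfo->mi_it);
+ *	rc = mdc_intent_getattr_async(exp, minfo, einfo);
+ */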
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
new file mode 100644
index 000000000..5e9c6296c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
@@ -0,0 +1,483 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/kernel.h>
+
+#include "../include/obd_class.h"
+#include "mdc_internal.h"
+#include "../include/lustre_fid.h"
+
+/* mdc_setattr does its own semaphore handling */
+static int mdc_reint(struct ptlrpc_request *request,
+ struct mdc_rpc_lock *rpc_lock,
+ int level)
+{
+ int rc;
+
+ request->rq_send_state = level;
+
+ mdc_get_rpc_lock(rpc_lock, NULL);
+ rc = ptlrpc_queue_wait(request);
+ mdc_put_rpc_lock(rpc_lock, NULL);
+ if (rc)
+ CDEBUG(D_INFO, "error in handling %d\n", rc);
+ else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY))
+ rc = -EPROTO;
+
+ return rc;
+}
+
+/* Find and locally cancel the locks matching inode @bits and @mode in the
+ * resource found by @fid. Found locks are added to the @cancels list. Returns
+ * the number of locks added to the @cancels list. */
+int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid,
+ struct list_head *cancels, ldlm_mode_t mode,
+ __u64 bits)
+{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ ldlm_policy_data_t policy = {};
+ struct ldlm_res_id res_id;
+ struct ldlm_resource *res;
+ int count;
+
+	/* Return (i.e. cancel nothing) only if ELC is supported (flag in the
+	 * export) but disabled through procfs (flag in the namespace).
+	 *
+	 * This is distinct from the case when ELC is not supported at all,
+	 * where we still want to cancel locks in advance and just cancel
+	 * them locally, without sending any RPC. */
+ if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+ return 0;
+
+ fid_build_reg_res_name(fid, &res_id);
+ res = ldlm_resource_get(exp->exp_obd->obd_namespace,
+ NULL, &res_id, 0, 0);
+ if (res == NULL)
+ return 0;
+ LDLM_RESOURCE_ADDREF(res);
+ /* Initialize ibits lock policy. */
+ policy.l_inodebits.bits = bits;
+ count = ldlm_cancel_resource_local(res, cancels, &policy,
+ mode, 0, 0, NULL);
+ LDLM_RESOURCE_DELREF(res);
+ ldlm_resource_putref(res);
+ return count;
+}
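+
+/*
+ * The helper above is the first half of the early lock cancel (ELC) pattern
+ * used by the reint operations below; a condensed sketch of the full pattern
+ * (error handling trimmed, format name taken from mdc_setattr()):
+ *
+ *	LIST_HEAD(cancels);
+ *	int count;
+ *
+ *	count = mdc_resource_get_unused(exp, fid, &cancels, LCK_EX,
+ *					MDS_INODELOCK_UPDATE);
+ *	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ *				   &RQF_MDS_REINT_SETATTR);
+ *	if (req == NULL) {
+ *		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ *		return -ENOMEM;
+ *	}
+ *	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ */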
+
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+ void *ea, int ealen, void *ea2, int ea2len,
+ struct ptlrpc_request **request, struct md_open_data **mod)
+{
+ LIST_HEAD(cancels);
+ struct ptlrpc_request *req;
+ struct mdc_rpc_lock *rpc_lock;
+ struct obd_device *obd = exp->exp_obd;
+ int count = 0, rc;
+ __u64 bits;
+
+ LASSERT(op_data != NULL);
+
+ bits = MDS_INODELOCK_UPDATE;
+ if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+ bits |= MDS_INODELOCK_LOOKUP;
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+ (fid_is_sane(&op_data->op_fid1)) &&
+ !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+ count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, LCK_EX, bits);
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_REINT_SETATTR);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0)
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT,
+ 0);
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen);
+ req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT,
+ ea2len);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ rpc_lock = obd->u.cli.cl_rpc_lock;
+
+ if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME))
+ CDEBUG(D_INODE, "setting mtime "CFS_TIME_T
+ ", ctime "CFS_TIME_T"\n",
+ LTIME_S(op_data->op_attr.ia_mtime),
+ LTIME_S(op_data->op_attr.ia_ctime));
+ mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len);
+
+ ptlrpc_request_set_replen(req);
+ if (mod && (op_data->op_flags & MF_EPOCH_OPEN) &&
+ req->rq_import->imp_replayable) {
+ LASSERT(*mod == NULL);
+
+ *mod = obd_mod_alloc();
+ if (*mod == NULL) {
+ DEBUG_REQ(D_ERROR, req, "Can't allocate md_open_data");
+ } else {
+ req->rq_replay = 1;
+ req->rq_cb_data = *mod;
+ (*mod)->mod_open_req = req;
+ req->rq_commit_cb = mdc_commit_open;
+ (*mod)->mod_is_create = true;
+ /**
+			 * Take an extra reference on \var mod; it protects \var
+			 * mod from being freed on eviction (the commit callback
+			 * is called despite the rq_replay flag).
+			 * It will be put in mdc_done_writing().
+ */
+ obd_mod_get(*mod);
+ }
+ }
+
+ rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL);
+
+ /* Save the obtained info in the original RPC for the replay case. */
+ if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) {
+ struct mdt_ioepoch *epoch;
+ struct mdt_body *body;
+
+ epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(epoch != NULL);
+ LASSERT(body != NULL);
+ epoch->handle = body->handle;
+ epoch->ioepoch = body->ioepoch;
+ req->rq_replay_cb = mdc_replay_open;
+		/** bug 3633: the open may be committed, and an ESTALE answer is not an error */
+ } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) {
+ rc = 0;
+ } else if (rc == -ERESTARTSYS) {
+ rc = 0;
+ }
+ *request = req;
+ if (rc && req->rq_commit_cb) {
+		/* Put the extra reference on \var mod in the error case. */
+ if (mod != NULL && *mod != NULL)
+ obd_mod_put(*mod);
+ req->rq_commit_cb(req);
+ }
+ return rc;
+}
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+ const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+ cfs_cap_t cap_effective, __u64 rdev,
+ struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int level, rc;
+ int count, resends = 0;
+ struct obd_import *import = exp->exp_obd->u.cli.cl_import;
+ int generation = import->imp_generation;
+ LIST_HEAD(cancels);
+
+	/* If the upper layer did not allocate a fid, do it now. */
+ if (!fid_is_sane(&op_data->op_fid2)) {
+ /*
+		 * mdc_fid_alloc() may return 1 when switching to a new
+		 * sequence; handle this.
+ */
+ rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+ if (rc < 0) {
+ CERROR("Can't alloc new fid, rc %d\n", rc);
+ return rc;
+ }
+ }
+
+rebuild:
+ count = 0;
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+ (fid_is_sane(&op_data->op_fid1)))
+ count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_REINT_CREATE_RMT_ACL);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+ data && datalen ? datalen : 0);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ /*
+ * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
+ * tgt, for symlinks or lov MD data.
+ */
+ mdc_create_pack(req, op_data, data, datalen, mode, uid,
+ gid, cap_effective, rdev);
+
+ ptlrpc_request_set_replen(req);
+
+ /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry
+ * logic here */
+ req->rq_no_retry_einprogress = 1;
+
+ if (resends) {
+ req->rq_generation_set = 1;
+ req->rq_import_generation = generation;
+ req->rq_sent = get_seconds() + resends;
+ }
+ level = LUSTRE_IMP_FULL;
+ resend:
+ rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level);
+
+ /* Resend if we were told to. */
+ if (rc == -ERESTARTSYS) {
+ level = LUSTRE_IMP_RECOVER;
+ goto resend;
+ } else if (rc == -EINPROGRESS) {
+		/* Retry the create indefinitely until it succeeds or fails
+		 * with an error other than -EINPROGRESS. */
+ ptlrpc_req_finished(req);
+ resends++;
+
+ CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n",
+ exp->exp_obd->obd_name, resends,
+ PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+ if (generation == import->imp_generation) {
+ goto rebuild;
+ } else {
+ CDEBUG(D_HA, "resend cross eviction\n");
+ return -EIO;
+ }
+ } else if (rc == 0) {
+ struct mdt_body *body;
+ struct lustre_capa *capa;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body);
+ if (body->valid & OBD_MD_FLMDSCAPA) {
+ capa = req_capsule_server_get(&req->rq_pill,
+ &RMF_CAPA1);
+ if (capa == NULL)
+ rc = -EPROTO;
+ }
+ }
+
+ *request = req;
+ return rc;
+}
+
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ LIST_HEAD(cancels);
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req = *request;
+ int count = 0, rc;
+
+ LASSERT(req == NULL);
+
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+ (fid_is_sane(&op_data->op_fid1)) &&
+ !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+ count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+ (fid_is_sane(&op_data->op_fid3)) &&
+ !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+ count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_FULL);
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_REINT_UNLINK);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_unlink_pack(req, op_data);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ obd->u.cli.cl_default_mds_easize);
+ req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+ obd->u.cli.cl_default_mds_cookiesize);
+ ptlrpc_request_set_replen(req);
+
+ *request = req;
+
+ rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+ if (rc == -ERESTARTSYS)
+ rc = 0;
+ return rc;
+}
+
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ LIST_HEAD(cancels);
+ struct obd_device *obd = exp->exp_obd;
+ struct ptlrpc_request *req;
+ int count = 0, rc;
+
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+ (fid_is_sane(&op_data->op_fid2)))
+ count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+ (fid_is_sane(&op_data->op_fid1)))
+ count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_link_pack(req, op_data);
+ ptlrpc_request_set_replen(req);
+
+ rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+ *request = req;
+ if (rc == -ERESTARTSYS)
+ rc = 0;
+
+ return rc;
+}
+
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+ const char *old, int oldlen, const char *new, int newlen,
+ struct ptlrpc_request **request)
+{
+ LIST_HEAD(cancels);
+ struct obd_device *obd = exp->exp_obd;
+ struct ptlrpc_request *req;
+ int count = 0, rc;
+
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+ (fid_is_sane(&op_data->op_fid1)))
+ count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+ (fid_is_sane(&op_data->op_fid2)))
+ count += mdc_resource_get_unused(exp, &op_data->op_fid2,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_UPDATE);
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+ (fid_is_sane(&op_data->op_fid3)))
+ count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_LOOKUP);
+ if ((op_data->op_flags & MF_MDC_CANCEL_FID4) &&
+ (fid_is_sane(&op_data->op_fid4)))
+ count += mdc_resource_get_unused(exp, &op_data->op_fid4,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_FULL);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_REINT_RENAME);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1);
+ req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ if (exp_connect_cancelset(exp) && req)
+ ldlm_cli_cancel_list(&cancels, count, req, 0);
+
+ mdc_rename_pack(req, op_data, old, oldlen, new, newlen);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ obd->u.cli.cl_default_mds_easize);
+ req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+ obd->u.cli.cl_default_mds_cookiesize);
+ ptlrpc_request_set_replen(req);
+
+ rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+ *request = req;
+ if (rc == -ERESTARTSYS)
+ rc = 0;
+
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
new file mode 100644
index 000000000..f8ef5fe5e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -0,0 +1,2731 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+# include <linux/utsname.h>
+
+#include "../include/lustre_acl.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_fid.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_param.h"
+#include "../include/lustre_log.h"
+
+#include "mdc_internal.h"
+
+#define REQUEST_MINOR 244
+
+struct mdc_renew_capa_args {
+ struct obd_capa *ra_oc;
+ renew_capa_cb_t ra_cb;
+};
+
+static int mdc_cleanup(struct obd_device *obd);
+
+static int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+ const struct req_msg_field *field, struct obd_capa **oc)
+{
+ struct lustre_capa *capa;
+ struct obd_capa *c;
+
+ /* swabbed already in mdc_enqueue */
+ capa = req_capsule_server_get(&req->rq_pill, field);
+ if (capa == NULL)
+ return -EPROTO;
+
+ c = alloc_capa(CAPA_SITE_CLIENT);
+ if (IS_ERR(c)) {
+ CDEBUG(D_INFO, "alloc capa failed!\n");
+ return PTR_ERR(c);
+ } else {
+ c->c_capa = *capa;
+ *oc = c;
+ return 0;
+ }
+}
+
+static inline int mdc_queue_wait(struct ptlrpc_request *req)
+{
+ struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+ int rc;
+
+	/* mdc_enter_request() ensures that this client has no more
+	 * than cl_max_rpcs_in_flight RPCs simultaneously in flight
+	 * against an MDT. */
+ rc = mdc_enter_request(cli);
+ if (rc != 0)
+ return rc;
+
+ rc = ptlrpc_queue_wait(req);
+ mdc_exit_request(cli);
+
+ return rc;
+}
+
+/* Helper that implements most of mdc_getstatus and signal_completed_replay. */
+/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */
+static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid,
+ struct obd_capa **pc, int level, int msg_flags)
+{
+ struct ptlrpc_request *req;
+ struct mdt_body *body;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS,
+ LUSTRE_MDS_VERSION, MDS_GETSTATUS);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0);
+ lustre_msg_add_flags(req->rq_reqmsg, msg_flags);
+ req->rq_send_state = level;
+
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (body->valid & OBD_MD_FLMDSCAPA) {
+ rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc);
+ if (rc)
+ goto out;
+ }
+
+ *rootfid = body->fid1;
+ CDEBUG(D_NET,
+ "root fid="DFID", last_committed=%llu\n",
+ PFID(rootfid),
+ lustre_msg_get_last_committed(req->rq_repmsg));
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+/* This should be mdc_get_info("rootfid") */
+static int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid,
+ struct obd_capa **pc)
+{
+ return send_getstatus(class_exp2cliimp(exp), rootfid, pc,
+ LUSTRE_IMP_FULL, 0);
+}
+
+/*
+ * This function is known to always say that it will receive 4 buffers from
+ * the server. Even when acl_size and md_size are zero, the RPC header will
+ * contain 4 fields and the RPC itself will contain zero-size fields. This is
+ * because mdt_getattr*() _always_ returns 4 fields, but if the ACL is not
+ * needed (and thus zero) it shrinks that field to zero size; the same goes
+ * for md_size. This causes problems when the client expects a smaller number
+ * of fields. The issue will be fixed later, when the client becomes aware of
+ * RPC layouts. --umka
+ */
+static int mdc_getattr_common(struct obd_export *exp,
+ struct ptlrpc_request *req)
+{
+ struct req_capsule *pill = &req->rq_pill;
+ struct mdt_body *body;
+ void *eadata;
+ int rc;
+
+ /* Request message already built. */
+ rc = ptlrpc_queue_wait(req);
+ if (rc != 0)
+ return rc;
+
+ /* sanity check for the reply */
+ body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ return -EPROTO;
+
+ CDEBUG(D_NET, "mode: %o\n", body->mode);
+
+ if (body->eadatasize != 0) {
+ mdc_update_max_ea_from_body(exp, body);
+
+ eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+ body->eadatasize);
+ if (eadata == NULL)
+ return -EPROTO;
+ }
+
+ if (body->valid & OBD_MD_FLRMTPERM) {
+ struct mdt_remote_perm *perm;
+
+ LASSERT(client_is_remote(exp));
+ perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+ lustre_swab_mdt_remote_perm);
+ if (perm == NULL)
+ return -EPROTO;
+ }
+
+ if (body->valid & OBD_MD_FLMDSCAPA) {
+ struct lustre_capa *capa;
+
+ capa = req_capsule_server_get(pill, &RMF_CAPA1);
+ if (capa == NULL)
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ /* Single MDS without an LMV case */
+ if (op_data->op_flags & MF_GET_MDT_IDX) {
+ op_data->op_mds = 0;
+ return 0;
+ }
+ *request = NULL;
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+ op_data->op_valid, op_data->op_mode, -1, 0);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ op_data->op_mode);
+ if (op_data->op_valid & OBD_MD_FLRMTPERM) {
+ LASSERT(client_is_remote(exp));
+ req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+ sizeof(struct mdt_remote_perm));
+ }
+ ptlrpc_request_set_replen(req);
+
+ rc = mdc_getattr_common(exp, req);
+ if (rc)
+ ptlrpc_req_finished(req);
+ else
+ *request = req;
+ return rc;
+}
+
+static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+ struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ *request = NULL;
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_GETATTR_NAME);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ op_data->op_namelen + 1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+ op_data->op_valid, op_data->op_mode,
+ op_data->op_suppgids[0], 0);
+
+ if (op_data->op_name) {
+ char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+
+ LASSERT(strnlen(op_data->op_name, op_data->op_namelen) ==
+ op_data->op_namelen);
+ memcpy(name, op_data->op_name, op_data->op_namelen);
+ }
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ op_data->op_mode);
+ ptlrpc_request_set_replen(req);
+
+ rc = mdc_getattr_common(exp, req);
+ if (rc)
+ ptlrpc_req_finished(req);
+ else
+ *request = req;
+ return rc;
+}
+
+static int mdc_is_subdir(struct obd_export *exp,
+ const struct lu_fid *pfid,
+ const struct lu_fid *cfid,
+ struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ *request = NULL;
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION,
+ MDS_IS_SUBDIR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_is_subdir_pack(req, pfid, cfid, 0);
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc && rc != -EREMOTE)
+ ptlrpc_req_finished(req);
+ else
+ *request = req;
+ return rc;
+}
+
+static int mdc_xattr_common(struct obd_export *exp,
+ const struct req_format *fmt,
+ const struct lu_fid *fid,
+ struct obd_capa *oc, int opcode, u64 valid,
+ const char *xattr_name, const char *input,
+ int input_size, int output_size, int flags,
+ __u32 suppgid, struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int xattr_namelen = 0;
+ char *tmp;
+ int rc;
+
+ *request = NULL;
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, oc);
+ if (xattr_name) {
+ xattr_namelen = strlen(xattr_name) + 1;
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ xattr_namelen);
+ }
+ if (input_size) {
+ LASSERT(input);
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+ input_size);
+ }
+
+ /* Flush local XATTR locks to get rid of a possible cancel RPC */
+ if (opcode == MDS_REINT && fid_is_sane(fid) &&
+ exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) {
+ LIST_HEAD(cancels);
+ int count;
+
+		/* Without this, packing would fail. */
+ if (input_size == 0)
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+ RCL_CLIENT, 0);
+
+ count = mdc_resource_get_unused(exp, fid,
+ &cancels, LCK_EX,
+ MDS_INODELOCK_XATTR);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+ } else {
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+ }
+
+ if (opcode == MDS_REINT) {
+ struct mdt_rec_setxattr *rec;
+
+ CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+ sizeof(struct mdt_rec_reint));
+ rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+ rec->sx_opcode = REINT_SETXATTR;
+ rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ rec->sx_cap = cfs_curproc_cap_pack();
+ rec->sx_suppgid1 = suppgid;
+ rec->sx_suppgid2 = -1;
+ rec->sx_fid = *fid;
+ rec->sx_valid = valid | OBD_MD_FLCTIME;
+ rec->sx_time = get_seconds();
+ rec->sx_size = output_size;
+ rec->sx_flags = flags;
+
+ mdc_pack_capa(req, &RMF_CAPA1, oc);
+ } else {
+ mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags);
+ }
+
+ if (xattr_name) {
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ memcpy(tmp, xattr_name, xattr_namelen);
+ }
+ if (input_size) {
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+ memcpy(tmp, input, input_size);
+ }
+
+ if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER))
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+ RCL_SERVER, output_size);
+ ptlrpc_request_set_replen(req);
+
+ /* make rpc */
+ if (opcode == MDS_REINT)
+ mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+ rc = ptlrpc_queue_wait(req);
+
+ if (opcode == MDS_REINT)
+ mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+ if (rc)
+ ptlrpc_req_finished(req);
+ else
+ *request = req;
+ return rc;
+}
+
+static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, u64 valid, const char *xattr_name,
+ const char *input, int input_size, int output_size,
+ int flags, __u32 suppgid, struct ptlrpc_request **request)
+{
+ return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR,
+ fid, oc, MDS_REINT, valid, xattr_name,
+ input, input_size, output_size, flags,
+ suppgid, request);
+}
+
+static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, u64 valid, const char *xattr_name,
+ const char *input, int input_size, int output_size,
+ int flags, struct ptlrpc_request **request)
+{
+ return mdc_xattr_common(exp, &RQF_MDS_GETXATTR,
+ fid, oc, MDS_GETXATTR, valid, xattr_name,
+ input, input_size, output_size, flags,
+ -1, request);
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md)
+{
+ struct req_capsule *pill = &req->rq_pill;
+ struct mdt_body *body = md->body;
+ struct posix_acl *acl;
+ void *buf;
+ int rc;
+
+ if (!body->aclsize)
+ return 0;
+
+ buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize);
+
+ if (!buf)
+ return -EPROTO;
+
+ acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize);
+ if (acl == NULL)
+ return 0;
+
+ if (IS_ERR(acl)) {
+ rc = PTR_ERR(acl);
+ CERROR("convert xattr to acl: %d\n", rc);
+ return rc;
+ }
+
+ rc = posix_acl_valid(acl);
+ if (rc) {
+ CERROR("validate acl: %d\n", rc);
+ posix_acl_release(acl);
+ return rc;
+ }
+
+ md->posix_acl = acl;
+ return 0;
+}
+#else
+#define mdc_unpack_acl(req, md) 0
+#endif
+
+int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+ struct obd_export *dt_exp, struct obd_export *md_exp,
+ struct lustre_md *md)
+{
+ struct req_capsule *pill = &req->rq_pill;
+ int rc;
+
+ LASSERT(md);
+ memset(md, 0, sizeof(*md));
+
+ md->body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ LASSERT(md->body != NULL);
+
+ if (md->body->valid & OBD_MD_FLEASIZE) {
+ int lmmsize;
+ struct lov_mds_md *lmm;
+
+ if (!S_ISREG(md->body->mode)) {
+ CDEBUG(D_INFO,
+ "OBD_MD_FLEASIZE set, should be a regular file, but is not\n");
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (md->body->eadatasize == 0) {
+ CDEBUG(D_INFO,
+ "OBD_MD_FLEASIZE set, but eadatasize 0\n");
+ rc = -EPROTO;
+ goto out;
+ }
+ lmmsize = md->body->eadatasize;
+ lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize);
+ if (!lmm) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize);
+ if (rc < 0)
+ goto out;
+
+ if (rc < sizeof(*md->lsm)) {
+ CDEBUG(D_INFO,
+ "lsm size too small: rc < sizeof (*md->lsm) (%d < %d)\n",
+ rc, (int)sizeof(*md->lsm));
+ rc = -EPROTO;
+ goto out;
+ }
+
+ } else if (md->body->valid & OBD_MD_FLDIREA) {
+ int lmvsize;
+ struct lov_mds_md *lmv;
+
+ if (!S_ISDIR(md->body->mode)) {
+ CDEBUG(D_INFO,
+ "OBD_MD_FLDIREA set, should be a directory, but is not\n");
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (md->body->eadatasize == 0) {
+ CDEBUG(D_INFO,
+ "OBD_MD_FLDIREA is set, but eadatasize 0\n");
+ return -EPROTO;
+ }
+ if (md->body->valid & OBD_MD_MEA) {
+ lmvsize = md->body->eadatasize;
+ lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+ lmvsize);
+ if (!lmv) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv,
+ lmvsize);
+ if (rc < 0)
+ goto out;
+
+ if (rc < sizeof(*md->mea)) {
+ CDEBUG(D_INFO,
+ "size too small: rc < sizeof(*md->mea) (%d < %d)\n",
+ rc, (int)sizeof(*md->mea));
+ rc = -EPROTO;
+ goto out;
+ }
+ }
+ }
+ rc = 0;
+
+ if (md->body->valid & OBD_MD_FLRMTPERM) {
+ /* remote permission */
+ LASSERT(client_is_remote(exp));
+ md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+ lustre_swab_mdt_remote_perm);
+ if (!md->remote_perm) {
+ rc = -EPROTO;
+ goto out;
+ }
+ } else if (md->body->valid & OBD_MD_FLACL) {
+		/* For ACLs it is possible that FLACL is set but aclsize is
+		 * zero; only when aclsize != 0 is there an actual ACL segment
+		 * in the reply buffer.
+		 */
+ if (md->body->aclsize) {
+ rc = mdc_unpack_acl(req, md);
+ if (rc)
+ goto out;
+#ifdef CONFIG_FS_POSIX_ACL
+ } else {
+ md->posix_acl = NULL;
+#endif
+ }
+ }
+ if (md->body->valid & OBD_MD_FLMDSCAPA) {
+ struct obd_capa *oc = NULL;
+
+ rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc);
+ if (rc)
+ goto out;
+ md->mds_capa = oc;
+ }
+
+ if (md->body->valid & OBD_MD_FLOSSCAPA) {
+ struct obd_capa *oc = NULL;
+
+ rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc);
+ if (rc)
+ goto out;
+ md->oss_capa = oc;
+ }
+
+out:
+ if (rc) {
+ if (md->oss_capa) {
+ capa_put(md->oss_capa);
+ md->oss_capa = NULL;
+ }
+ if (md->mds_capa) {
+ capa_put(md->mds_capa);
+ md->mds_capa = NULL;
+ }
+#ifdef CONFIG_FS_POSIX_ACL
+ posix_acl_release(md->posix_acl);
+#endif
+ if (md->lsm)
+ obd_free_memmd(dt_exp, &md->lsm);
+ }
+ return rc;
+}
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+ return 0;
+}
+
+/**
+ * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING
+ * RPC chains.
+ */
+void mdc_replay_open(struct ptlrpc_request *req)
+{
+ struct md_open_data *mod = req->rq_cb_data;
+ struct ptlrpc_request *close_req;
+ struct obd_client_handle *och;
+ struct lustre_handle old;
+ struct mdt_body *body;
+
+ if (mod == NULL) {
+ DEBUG_REQ(D_ERROR, req,
+ "Can't properly replay without open data.");
+ return;
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(body != NULL);
+
+ och = mod->mod_och;
+ if (och != NULL) {
+ struct lustre_handle *file_fh;
+
+ LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
+
+ file_fh = &och->och_fh;
+ CDEBUG(D_HA, "updating handle from %#llx to %#llx\n",
+ file_fh->cookie, body->handle.cookie);
+ old = *file_fh;
+ *file_fh = body->handle;
+ }
+ close_req = mod->mod_close_req;
+ if (close_req != NULL) {
+ __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
+ struct mdt_ioepoch *epoch;
+
+ LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING);
+ epoch = req_capsule_client_get(&close_req->rq_pill,
+ &RMF_MDT_EPOCH);
+ LASSERT(epoch);
+
+ if (och != NULL)
+ LASSERT(!memcmp(&old, &epoch->handle, sizeof(old)));
+ DEBUG_REQ(D_HA, close_req, "updating close body with new fh");
+ epoch->handle = body->handle;
+ }
+}
+
+void mdc_commit_open(struct ptlrpc_request *req)
+{
+ struct md_open_data *mod = req->rq_cb_data;
+
+ if (mod == NULL)
+ return;
+
+ /**
+	 * No need to touch md_open_data::mod_och here: it holds a reference
+	 * on \var mod, the two will drop their references to each other, and
+	 * \var mod will be freed once md_open_data::mod_och puts its reference.
+ */
+
+ /**
+	 * Do not let the open request disappear, as it may still be needed
+	 * for the close RPC (this can happen on evict only; otherwise
+	 * ptlrpc_request::rq_replay does not allow mdc_commit_open() to be
+	 * called). Just mark this RPC as committed to distinguish these two
+	 * cases; see mdc_close() for details. The open request reference will
+	 * be put when \var mod is freed.
+ */
+ ptlrpc_request_addref(req);
+ spin_lock(&req->rq_lock);
+ req->rq_committed = 1;
+ spin_unlock(&req->rq_lock);
+ req->rq_cb_data = NULL;
+ obd_mod_put(mod);
+}
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och,
+ struct lookup_intent *it)
+{
+ struct md_open_data *mod;
+ struct mdt_rec_create *rec;
+ struct mdt_body *body;
+ struct ptlrpc_request *open_req = it->d.lustre.it_data;
+ struct obd_import *imp = open_req->rq_import;
+
+ if (!open_req->rq_replay)
+ return 0;
+
+ rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT);
+ body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
+ LASSERT(rec != NULL);
+ /* Incoming message in my byte order (it's been swabbed). */
+ /* Outgoing messages always in my byte order. */
+ LASSERT(body != NULL);
+
+	/* We set the replay_open data only if the import is replayable. */
+ if (och && imp->imp_replayable) {
+ mod = obd_mod_alloc();
+ if (mod == NULL) {
+ DEBUG_REQ(D_ERROR, open_req,
+ "Can't allocate md_open_data");
+ return 0;
+ }
+
+ /**
+		 * Take a reference on \var mod, to be dropped in mdc_close().
+		 * It protects \var mod from being freed on eviction (the
+		 * commit callback is called despite the rq_replay flag).
+		 * Take another reference for \var och.
+ */
+ obd_mod_get(mod);
+ obd_mod_get(mod);
+
+ spin_lock(&open_req->rq_lock);
+ och->och_mod = mod;
+ mod->mod_och = och;
+ mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) ||
+ it_disposition(it, DISP_OPEN_STRIPE);
+ mod->mod_open_req = open_req;
+ open_req->rq_cb_data = mod;
+ open_req->rq_commit_cb = mdc_commit_open;
+ spin_unlock(&open_req->rq_lock);
+ }
+
+ rec->cr_fid2 = body->fid1;
+ rec->cr_ioepoch = body->ioepoch;
+ rec->cr_old_handle.cookie = body->handle.cookie;
+ open_req->rq_replay_cb = mdc_replay_open;
+ if (!fid_is_sane(&body->fid1)) {
+ DEBUG_REQ(D_ERROR, open_req,
+ "Saving replay request with insane fid");
+ LBUG();
+ }
+
+ DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data");
+ return 0;
+}
+
+static void mdc_free_open(struct md_open_data *mod)
+{
+ int committed = 0;
+
+ if (mod->mod_is_create == 0 &&
+ imp_connect_disp_stripe(mod->mod_open_req->rq_import))
+ committed = 1;
+
+ LASSERT(mod->mod_open_req->rq_replay == 0);
+
+ DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "free open request\n");
+
+ ptlrpc_request_committed(mod->mod_open_req, committed);
+ if (mod->mod_close_req)
+ ptlrpc_request_committed(mod->mod_close_req, committed);
+}
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+ struct obd_client_handle *och)
+{
+ struct md_open_data *mod = och->och_mod;
+
+ /**
+	 * It is possible to not have \var mod if an eviction happens between
+	 * lookup and ll_file_open().
+ **/
+ if (mod == NULL)
+ return 0;
+
+ LASSERT(mod != LP_POISON);
+ LASSERT(mod->mod_open_req != NULL);
+ mdc_free_open(mod);
+
+ mod->mod_och = NULL;
+ och->och_mod = NULL;
+ obd_mod_put(mod);
+
+ return 0;
+}
+
+/* Prepare the request for replay based on the given reply. */
+static void mdc_close_handle_reply(struct ptlrpc_request *req,
+ struct md_op_data *op_data, int rc) {
+ struct mdt_body *repbody;
+ struct mdt_ioepoch *epoch;
+
+ if (req && rc == -EAGAIN) {
+ repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+
+ epoch->flags |= MF_SOM_AU;
+ if (repbody->valid & OBD_MD_FLGETATTRLOCK)
+ op_data->op_flags |= MF_GETATTR_LOCK;
+ }
+}
+
+static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
+ struct md_open_data *mod, struct ptlrpc_request **request)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ struct req_format *req_fmt;
+ int rc;
+ int saved_rc = 0;
+
+
+ req_fmt = &RQF_MDS_CLOSE;
+ if (op_data->op_bias & MDS_HSM_RELEASE) {
+ req_fmt = &RQF_MDS_RELEASE_CLOSE;
+
+ /* allocate a FID for volatile file */
+ rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+ if (rc < 0) {
+ CERROR("%s: "DFID" failed to allocate FID: %d\n",
+ obd->obd_name, PFID(&op_data->op_fid1), rc);
+ /* save the errcode and proceed to close */
+ saved_rc = rc;
+ }
+ }
+
+ *request = NULL;
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
+ * portal whose threads are not taking any DLM locks and are therefore
+ * always progressing */
+ req->rq_request_portal = MDS_READPAGE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ /* Ensure that this close's handle is fixed up during replay. */
+ if (likely(mod != NULL)) {
+ LASSERTF(mod->mod_open_req != NULL &&
+ mod->mod_open_req->rq_type != LI_POISON,
+ "POISONED open %p!\n", mod->mod_open_req);
+
+ mod->mod_close_req = req;
+
+ DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
+ /* We no longer want to preserve this open for replay even
+ * though the open was committed. b=3632, b=3633 */
+ spin_lock(&mod->mod_open_req->rq_lock);
+ mod->mod_open_req->rq_replay = 0;
+ spin_unlock(&mod->mod_open_req->rq_lock);
+ } else {
+ CDEBUG(D_HA,
+ "couldn't find open req; expecting close error\n");
+ }
+
+ mdc_close_pack(req, op_data);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+ obd->u.cli.cl_default_mds_easize);
+ req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+ obd->u.cli.cl_default_mds_cookiesize);
+
+ ptlrpc_request_set_replen(req);
+
+ mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+ rc = ptlrpc_queue_wait(req);
+ mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+ if (req->rq_repmsg == NULL) {
+ CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req,
+ req->rq_status);
+ if (rc == 0)
+ rc = req->rq_status ?: -EIO;
+ } else if (rc == 0 || rc == -EAGAIN) {
+ struct mdt_body *body;
+
+ rc = lustre_msg_get_status(req->rq_repmsg);
+ if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+ DEBUG_REQ(D_ERROR, req,
+ "type == PTL_RPC_MSG_ERR, err = %d", rc);
+ if (rc > 0)
+ rc = -rc;
+ }
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL)
+ rc = -EPROTO;
+ } else if (rc == -ESTALE) {
+		/*
+		 * An ESTALE error is allowed after bug 3633 if the open was
+		 * committed and the server failed before the close was sent.
+		 * Check whether mod exists and return no error in that case.
+		 */
+ if (mod) {
+ DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc);
+ LASSERT(mod->mod_open_req != NULL);
+ if (mod->mod_open_req->rq_committed)
+ rc = 0;
+ }
+ }
+
+ if (mod) {
+ if (rc != 0)
+ mod->mod_close_req = NULL;
+		/* From now on, mod is accessed through open_req only,
+		 * so the close request no longer keeps a reference on mod. */
+ obd_mod_put(mod);
+ }
+ *request = req;
+ mdc_close_handle_reply(req, op_data, rc);
+ return rc < 0 ? rc : saved_rc;
+}
+
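+/* Descriptive note on the function below: send an MDS_DONE_WRITING RPC for
+ * @op_data, pairing it with the open/setattr request saved in @mod; an
+ * -ESTALE reply is ignored if the saved request was already committed. */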
+static int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data,
+ struct md_open_data *mod)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_DONE_WRITING);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ if (mod != NULL) {
+ LASSERTF(mod->mod_open_req != NULL &&
+ mod->mod_open_req->rq_type != LI_POISON,
+ "POISONED setattr %p!\n", mod->mod_open_req);
+
+ mod->mod_close_req = req;
+ DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr");
+ /* We no longer want to preserve this setattr for replay even
+ * though the open was committed. b=3632, b=3633 */
+ spin_lock(&mod->mod_open_req->rq_lock);
+ mod->mod_open_req->rq_replay = 0;
+ spin_unlock(&mod->mod_open_req->rq_lock);
+ }
+
+ mdc_close_pack(req, op_data);
+ ptlrpc_request_set_replen(req);
+
+ mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+ rc = ptlrpc_queue_wait(req);
+ mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+ if (rc == -ESTALE) {
+		/*
+		 * An ESTALE error is allowed after bug 3633 if the open or
+		 * setattr was committed and the server failed before the
+		 * close was sent.
+		 * Check whether mod exists and return no error in that case.
+		 */
+ if (mod) {
+ LASSERT(mod->mod_open_req != NULL);
+ if (mod->mod_open_req->rq_committed)
+ rc = 0;
+ }
+ }
+
+ if (mod) {
+ if (rc != 0)
+ mod->mod_close_req = NULL;
+ LASSERT(mod->mod_open_req != NULL);
+ mdc_free_open(mod);
+
+		/* From now on, mod is accessed through the setattr request
+		 * only, so the DONE_WRITING request no longer keeps a
+		 * reference on mod. */
+ obd_mod_put(mod);
+ }
+
+ mdc_close_handle_reply(req, op_data, rc);
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
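+/* Descriptive note on the function below: read directory pages from the MDT
+ * with an MDS_READPAGE bulk RPC; the whole bulk transfer is retried with a
+ * growing delay if the request times out. */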
+static int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data,
+ struct page **pages, struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ struct ptlrpc_bulk_desc *desc;
+ int i;
+ wait_queue_head_t waitq;
+ int resends = 0;
+ struct l_wait_info lwi;
+ int rc;
+
+ *request = NULL;
+ init_waitqueue_head(&waitq);
+
+restart_bulk:
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ req->rq_request_portal = MDS_READPAGE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK,
+ MDS_BULK_PORTAL);
+ if (desc == NULL) {
+ ptlrpc_request_free(req);
+ return -ENOMEM;
+ }
+
+ /* NB req now owns desc and will free it when it gets freed */
+ for (i = 0; i < op_data->op_npages; i++)
+ ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+ mdc_readdir_pack(req, op_data->op_offset,
+ PAGE_CACHE_SIZE * op_data->op_npages,
+ &op_data->op_fid1, op_data->op_capa1);
+
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc) {
+ ptlrpc_req_finished(req);
+ if (rc != -ETIMEDOUT)
+ return rc;
+
+ resends++;
+ if (!client_should_resend(resends, &exp->exp_obd->u.cli)) {
+ CERROR("too many resend retries, returning error\n");
+ return -EIO;
+ }
+ lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends),
+ NULL, NULL, NULL);
+ l_wait_event(waitq, 0, &lwi);
+
+ goto restart_bulk;
+ }
+
+ rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk,
+ req->rq_bulk->bd_nob_transferred);
+ if (rc < 0) {
+ ptlrpc_req_finished(req);
+ return rc;
+ }
+
+ if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) {
+ CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
+ req->rq_bulk->bd_nob_transferred,
+ PAGE_CACHE_SIZE * op_data->op_npages);
+ ptlrpc_req_finished(req);
+ return -EPROTO;
+ }
+
+ *request = req;
+ return 0;
+}
+
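+/* Descriptive note on the function below: fetch filesystem statistics from
+ * the MDT via MDS_STATFS; with OBD_STATFS_NODELAY the request is neither
+ * delayed nor resent, so procfs readers cannot block. */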
+static int mdc_statfs(const struct lu_env *env,
+ struct obd_export *exp, struct obd_statfs *osfs,
+ __u64 max_age, __u32 flags)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ struct obd_statfs *msfs;
+ struct obd_import *imp = NULL;
+ int rc;
+
+	/*
+	 * The request might also come from lprocfs, so we need to
+	 * sync this with client_disconnect_export() (bug 15684).
+	 */
+ down_read(&obd->u.cli.cl_sem);
+ if (obd->u.cli.cl_import)
+ imp = class_import_get(obd->u.cli.cl_import);
+ up_read(&obd->u.cli.cl_sem);
+ if (!imp)
+ return -ENODEV;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS,
+ LUSTRE_MDS_VERSION, MDS_STATFS);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto output;
+ }
+
+ ptlrpc_request_set_replen(req);
+
+ if (flags & OBD_STATFS_NODELAY) {
+		/* procfs requests should not wait, to avoid a deadlock */
+ req->rq_no_resend = 1;
+ req->rq_no_delay = 1;
+ }
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc) {
+ /* check connection error first */
+ if (imp->imp_connect_error)
+ rc = imp->imp_connect_error;
+ goto out;
+ }
+
+ msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (msfs == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *osfs = *msfs;
+out:
+ ptlrpc_req_finished(req);
+output:
+ class_import_put(imp);
+ return rc;
+}
+
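+/* Descriptive note on the function below: resolve a FID to a path by packing
+ * KEY_FID2PATH plus the getinfo_fid2path descriptor as the key for
+ * obd_get_info(), copying the resulting path back into @gf. */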
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+ __u32 keylen, vallen;
+ void *key;
+ int rc;
+
+ if (gf->gf_pathlen > PATH_MAX)
+ return -ENAMETOOLONG;
+ if (gf->gf_pathlen < 2)
+ return -EOVERFLOW;
+
+ /* Key is KEY_FID2PATH + getinfo_fid2path description */
+ keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf);
+ OBD_ALLOC(key, keylen);
+ if (key == NULL)
+ return -ENOMEM;
+ memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+ memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf));
+
+ CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n",
+ PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+ if (!fid_is_sane(&gf->gf_fid)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Val is struct getinfo_fid2path result plus path */
+ vallen = sizeof(*gf) + gf->gf_pathlen;
+
+ rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL);
+ if (rc != 0 && rc != -EREMOTE)
+ goto out;
+
+ if (vallen <= sizeof(*gf)) {
+ rc = -EPROTO;
+ goto out;
+ } else if (vallen > sizeof(*gf) + gf->gf_pathlen) {
+ rc = -EOVERFLOW;
+ goto out;
+ }
+
+ CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n%s\n",
+ PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path);
+
+out:
+ OBD_FREE(key, keylen);
+ return rc;
+}
+
+static int mdc_ioc_hsm_progress(struct obd_export *exp,
+ struct hsm_progress_kernel *hpk)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+ struct hsm_progress_kernel *req_hpk;
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS,
+ LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+ /* Copy hsm_progress struct */
+ req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS);
+ if (req_hpk == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *req_hpk = *hpk;
+ req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval);
+
+ ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives)
+{
+ __u32 *archive_mask;
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER,
+ LUSTRE_MDS_VERSION,
+ MDS_HSM_CT_REGISTER);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+	/* Copy the archive mask */
+ archive_mask = req_capsule_client_get(&req->rq_pill,
+ &RMF_MDS_HSM_ARCHIVE);
+ if (archive_mask == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *archive_mask = archives;
+
+ ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_ioc_hsm_current_action(struct obd_export *exp,
+ struct md_op_data *op_data)
+{
+ struct hsm_current_action *hca = op_data->op_data;
+ struct hsm_current_action *req_hca;
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_HSM_ACTION);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+ OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+ ptlrpc_request_set_replen(req);
+
+ rc = mdc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ req_hca = req_capsule_server_get(&req->rq_pill,
+ &RMF_MDS_HSM_CURRENT_ACTION);
+ if (req_hca == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *hca = *req_hca;
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER,
+ LUSTRE_MDS_VERSION,
+ MDS_HSM_CT_UNREGISTER);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+ ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_ioc_hsm_state_get(struct obd_export *exp,
+ struct md_op_data *op_data)
+{
+ struct hsm_user_state *hus = op_data->op_data;
+ struct hsm_user_state *req_hus;
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_HSM_STATE_GET);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET);
+ if (rc != 0) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+ OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+ ptlrpc_request_set_replen(req);
+
+ rc = mdc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE);
+ if (req_hus == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *hus = *req_hus;
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_ioc_hsm_state_set(struct obd_export *exp,
+ struct md_op_data *op_data)
+{
+ struct hsm_state_set *hss = op_data->op_data;
+ struct hsm_state_set *req_hss;
+ struct ptlrpc_request *req;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_HSM_STATE_SET);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+ OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+ /* Copy states */
+ req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET);
+ if (req_hss == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+ *req_hss = *hss;
+
+ ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_ioc_hsm_request(struct obd_export *exp,
+ struct hsm_user_request *hur)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+ struct ptlrpc_request *req;
+ struct hsm_request *req_hr;
+ struct hsm_user_item *req_hui;
+ char *req_opaque;
+ int rc;
+
+ req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT,
+ hur->hur_request.hr_itemcount
+ * sizeof(struct hsm_user_item));
+ req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT,
+ hur->hur_request.hr_data_len);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+ /* Copy hsm_request struct */
+ req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST);
+ if (req_hr == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+ *req_hr = hur->hur_request;
+
+ /* Copy hsm_user_item structs */
+ req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM);
+ if (req_hui == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+ memcpy(req_hui, hur->hur_user_item,
+ hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item));
+
+ /* Copy opaque field */
+ req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA);
+ if (req_opaque == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+ memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len);
+
+ ptlrpc_request_set_replen(req);
+
+	rc = mdc_queue_wait(req);
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags)
+{
+ struct kuc_hdr *lh = (struct kuc_hdr *)buf;
+
+ LASSERT(len <= KUC_CHANGELOG_MSG_MAXSIZE);
+
+ lh->kuc_magic = KUC_MAGIC;
+ lh->kuc_transport = KUC_TRANSPORT_CHANGELOG;
+ lh->kuc_flags = flags;
+ lh->kuc_msgtype = CL_RECORD;
+ lh->kuc_msglen = len;
+ return lh;
+}
+
+#define D_CHANGELOG 0
+
+struct changelog_show {
+ __u64 cs_startrec;
+ __u32 cs_flags;
+ struct file *cs_fp;
+ char *cs_buf;
+ struct obd_device *cs_obd;
+};
+
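+/* Descriptive note on the callback below: llog_cat_process() callback that
+ * converts one changelog record into a KUC message and pushes it to the
+ * userspace pipe held in cs->cs_fp. */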
+static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh,
+ struct llog_rec_hdr *hdr, void *data)
+{
+ struct changelog_show *cs = data;
+ struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
+ struct kuc_hdr *lh;
+ int len, rc;
+
+ if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
+ rc = -EINVAL;
+ CERROR("%s: not a changelog rec %x/%d: rc = %d\n",
+ cs->cs_obd->obd_name, rec->cr_hdr.lrh_type,
+ rec->cr.cr_type, rc);
+ return rc;
+ }
+
+ if (rec->cr.cr_index < cs->cs_startrec) {
+ /* Skip entries earlier than what we are interested in */
+ CDEBUG(D_CHANGELOG, "rec=%llu start=%llu\n",
+ rec->cr.cr_index, cs->cs_startrec);
+ return 0;
+ }
+
+ CDEBUG(D_CHANGELOG, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID
+ " %.*s\n", rec->cr.cr_index, rec->cr.cr_type,
+ changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
+ rec->cr.cr_flags & CLF_FLAGMASK,
+ PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
+ rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
+
+ len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
+
+ /* Set up the message */
+ lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags);
+ memcpy(lh + 1, &rec->cr, len - sizeof(*lh));
+
+ rc = libcfs_kkuc_msg_put(cs->cs_fp, lh);
+ CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len, rc);
+
+ return rc;
+}
+
+static int mdc_changelog_send_thread(void *csdata)
+{
+ struct changelog_show *cs = csdata;
+ struct llog_ctxt *ctxt = NULL;
+ struct llog_handle *llh = NULL;
+ struct kuc_hdr *kuch;
+ int rc;
+
+ CDEBUG(D_CHANGELOG, "changelog to fp=%p start %llu\n",
+ cs->cs_fp, cs->cs_startrec);
+
+ OBD_ALLOC(cs->cs_buf, KUC_CHANGELOG_MSG_MAXSIZE);
+ if (cs->cs_buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Set up the remote catalog handle */
+ ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT);
+ if (ctxt == NULL) {
+ rc = -ENOENT;
+ goto out;
+ }
+ rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG,
+ LLOG_OPEN_EXISTS);
+ if (rc) {
+ CERROR("%s: fail to open changelog catalog: rc = %d\n",
+ cs->cs_obd->obd_name, rc);
+ goto out;
+ }
+ rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL);
+ if (rc) {
+ CERROR("llog_init_handle failed %d\n", rc);
+ goto out;
+ }
+
+ rc = llog_cat_process(NULL, llh, changelog_kkuc_cb, cs, 0, 0);
+
+ /* Send EOF no matter what our result */
+ kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch), cs->cs_flags);
+ if (kuch) {
+ kuch->kuc_msgtype = CL_EOF;
+ libcfs_kkuc_msg_put(cs->cs_fp, kuch);
+ }
+
+out:
+ fput(cs->cs_fp);
+ if (llh)
+ llog_cat_close(NULL, llh);
+ if (ctxt)
+ llog_ctxt_put(ctxt);
+ if (cs->cs_buf)
+ OBD_FREE(cs->cs_buf, KUC_CHANGELOG_MSG_MAXSIZE);
+ OBD_FREE_PTR(cs);
+ return rc;
+}
+
+static int mdc_ioc_changelog_send(struct obd_device *obd,
+ struct ioc_changelog *icc)
+{
+ struct changelog_show *cs;
+ int rc;
+
+ /* Freed in mdc_changelog_send_thread */
+ OBD_ALLOC_PTR(cs);
+ if (!cs)
+ return -ENOMEM;
+
+ cs->cs_obd = obd;
+ cs->cs_startrec = icc->icc_recno;
+ /* matching fput in mdc_changelog_send_thread */
+ cs->cs_fp = fget(icc->icc_id);
+ cs->cs_flags = icc->icc_flags;
+
+	/*
+	 * Start a new thread because we must return to the user
+	 * application before writing into our pipe.
+	 */
+ rc = PTR_ERR(kthread_run(mdc_changelog_send_thread, cs,
+ "mdc_clg_send_thread"));
+ if (!IS_ERR_VALUE(rc)) {
+ CDEBUG(D_CHANGELOG, "start changelog thread\n");
+ return 0;
+ }
+
+ CERROR("Failed to start changelog thread: %d\n", rc);
+ OBD_FREE_PTR(cs);
+ return rc;
+}
+
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+ struct lustre_kernelcomm *lk);
+
+static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ struct ptlrpc_request *req;
+ struct obd_quotactl *body;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION,
+ MDS_QUOTACHECK);
+ if (req == NULL)
+ return -ENOMEM;
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ *body = *oqctl;
+
+ ptlrpc_request_set_replen(req);
+
+	/* the next poll will find -ENODATA, which means quotacheck is
+	 * still in progress */
+ cli->cl_qchk_stat = -ENODATA;
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ cli->cl_qchk_stat = rc;
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_quota_poll_check(struct obd_export *exp,
+ struct if_quotacheck *qchk)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ int rc;
+
+ qchk->obd_uuid = cli->cl_target_uuid;
+ memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME));
+
+ rc = cli->cl_qchk_stat;
+ /* the client is not the previous one */
+ if (rc == CL_NOT_QUOTACHECKED)
+ rc = -EINTR;
+ return rc;
+}
+
+static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct ptlrpc_request *req;
+ struct obd_quotactl *oqc;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION,
+ MDS_QUOTACTL);
+ if (req == NULL)
+ return -ENOMEM;
+
+ oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ *oqc = *oqctl;
+
+ ptlrpc_request_set_replen(req);
+ ptlrpc_at_set_req_timeout(req);
+ req->rq_no_resend = 1;
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+ if (req->rq_repmsg) {
+ oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ if (oqc) {
+ *oqctl = *oqc;
+ } else if (!rc) {
+ CERROR("Can't unpack obd_quotactl\n");
+ rc = -EPROTO;
+ }
+ } else if (!rc) {
+ CERROR("Can't unpack obd_quotactl\n");
+ rc = -EPROTO;
+ }
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
+static int mdc_ioc_swap_layouts(struct obd_export *exp,
+ struct md_op_data *op_data)
+{
+ LIST_HEAD(cancels);
+ struct ptlrpc_request *req;
+ int rc, count;
+ struct mdc_swap_layouts *msl, *payload;
+
+ msl = op_data->op_data;
+
+	/* When the MDT receives the MDS_SWAP_LAYOUTS RPC, the first
+	 * thing it will do is cancel the two layout locks held by
+	 * this client.
+	 * So the client must cancel its layout locks on the two FIDs
+	 * along with the request RPC to avoid extra RPC round trips.
+	 */
+ count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+ LCK_CR, MDS_INODELOCK_LAYOUT);
+ count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels,
+ LCK_CR, MDS_INODELOCK_LAYOUT);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_MDS_SWAP_LAYOUTS);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+
+ mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+ mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+
+ rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_swap_layouts_pack(req, op_data);
+
+ payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS);
+ LASSERT(payload);
+
+ *payload = *msl;
+
+ ptlrpc_request_set_replen(req);
+
+	rc = ptlrpc_queue_wait(req);
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void *uarg)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_ioctl_data *data = karg;
+ struct obd_import *imp = obd->u.cli.cl_import;
+ int rc;
+
+ if (!try_module_get(THIS_MODULE)) {
+ CERROR("Can't get module. Is it alive?");
+ return -EINVAL;
+ }
+ switch (cmd) {
+ case OBD_IOC_CHANGELOG_SEND:
+ rc = mdc_ioc_changelog_send(obd, karg);
+ goto out;
+ case OBD_IOC_CHANGELOG_CLEAR: {
+ struct ioc_changelog *icc = karg;
+ struct changelog_setinfo cs = {
+ .cs_recno = icc->icc_recno,
+ .cs_id = icc->icc_id
+ };
+
+ rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR),
+ KEY_CHANGELOG_CLEAR, sizeof(cs), &cs,
+ NULL);
+ goto out;
+ }
+ case OBD_IOC_FID2PATH:
+ rc = mdc_ioc_fid2path(exp, karg);
+ goto out;
+ case LL_IOC_HSM_CT_START:
+ rc = mdc_ioc_hsm_ct_start(exp, karg);
+ /* ignore if it was already registered on this MDS. */
+ if (rc == -EEXIST)
+ rc = 0;
+ goto out;
+ case LL_IOC_HSM_PROGRESS:
+ rc = mdc_ioc_hsm_progress(exp, karg);
+ goto out;
+ case LL_IOC_HSM_STATE_GET:
+ rc = mdc_ioc_hsm_state_get(exp, karg);
+ goto out;
+ case LL_IOC_HSM_STATE_SET:
+ rc = mdc_ioc_hsm_state_set(exp, karg);
+ goto out;
+ case LL_IOC_HSM_ACTION:
+ rc = mdc_ioc_hsm_current_action(exp, karg);
+ goto out;
+ case LL_IOC_HSM_REQUEST:
+ rc = mdc_ioc_hsm_request(exp, karg);
+ goto out;
+ case OBD_IOC_CLIENT_RECOVER:
+ rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0);
+ if (rc < 0)
+ goto out;
+ rc = 0;
+ goto out;
+ case IOC_OSC_SET_ACTIVE:
+ rc = ptlrpc_set_import_active(imp, data->ioc_offset);
+ goto out;
+ case OBD_IOC_POLL_QUOTACHECK:
+ rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+ goto out;
+ case OBD_IOC_PING_TARGET:
+ rc = ptlrpc_obd_ping(obd);
+ goto out;
+ /*
+ * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by
+ * LMV instead of MDC. But when the cluster is upgraded from 1.8,
+ * there'd be no LMV layer thus we might be called here. Eventually
+ * this code should be removed.
+ * bz20731, LU-592.
+ */
+ case IOC_OBD_STATFS: {
+ struct obd_statfs stat_buf = {0};
+
+ if (*((__u32 *) data->ioc_inlbuf2) != 0) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ /* copy UUID */
+ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd),
+ min_t(size_t, data->ioc_plen2,
+ sizeof(struct obd_uuid)))) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ 0);
+ if (rc != 0)
+ goto out;
+
+ if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+ min_t(size_t, data->ioc_plen1,
+ sizeof(stat_buf)))) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ rc = 0;
+ goto out;
+ }
+ case OBD_IOC_QUOTACTL: {
+ struct if_quotactl *qctl = karg;
+ struct obd_quotactl *oqctl;
+
+ OBD_ALLOC_PTR(oqctl);
+ if (oqctl == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ QCTL_COPY(oqctl, qctl);
+ rc = obd_quotactl(exp, oqctl);
+ if (rc == 0) {
+ QCTL_COPY(qctl, oqctl);
+ qctl->qc_valid = QC_MDTIDX;
+ qctl->obd_uuid = obd->u.cli.cl_target_uuid;
+ }
+
+ OBD_FREE_PTR(oqctl);
+ goto out;
+ }
+ case LL_IOC_GET_CONNECT_FLAGS:
+ if (copy_to_user(uarg, exp_connect_flags_ptr(exp),
+ sizeof(*exp_connect_flags_ptr(exp)))) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ rc = 0;
+ goto out;
+ case LL_IOC_LOV_SWAP_LAYOUTS:
+ rc = mdc_ioc_swap_layouts(exp, karg);
+ goto out;
+ default:
+ CERROR("unrecognised ioctl: cmd = %#x\n", cmd);
+ rc = -ENOTTY;
+ goto out;
+ }
+out:
+ module_put(THIS_MODULE);
+
+ return rc;
+}
+
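+/* Descriptive note on the function below: generic MDS_GET_INFO RPC that
+ * sends @key to the MDT and copies up to @vallen bytes of the reply into
+ * @val. */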
+static int mdc_get_info_rpc(struct obd_export *exp,
+ u32 keylen, void *key,
+ int vallen, void *val)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+ struct ptlrpc_request *req;
+ char *tmp;
+ int rc = -EINVAL;
+
+ req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+ RCL_CLIENT, keylen);
+ req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+ RCL_CLIENT, sizeof(__u32));
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+ memcpy(tmp, key, keylen);
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+ memcpy(tmp, &vallen, sizeof(__u32));
+
+ req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+ RCL_SERVER, vallen);
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ /* -EREMOTE means the get_info result is partial, and it needs to
+ * continue on another MDT, see fid2path part in lmv_iocontrol */
+ if (rc == 0 || rc == -EREMOTE) {
+ tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL);
+ memcpy(val, tmp, vallen);
+ if (ptlrpc_rep_need_swab(req)) {
+ if (KEY_IS(KEY_FID2PATH))
+ lustre_swab_fid2path(val);
+ }
+ }
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
+static void lustre_swab_hai(struct hsm_action_item *h)
+{
+ __swab32s(&h->hai_len);
+ __swab32s(&h->hai_action);
+ lustre_swab_lu_fid(&h->hai_fid);
+ lustre_swab_lu_fid(&h->hai_dfid);
+ __swab64s(&h->hai_cookie);
+ __swab64s(&h->hai_extent.offset);
+ __swab64s(&h->hai_extent.length);
+ __swab64s(&h->hai_gid);
+}
+
+static void lustre_swab_hal(struct hsm_action_list *h)
+{
+ struct hsm_action_item *hai;
+ int i;
+
+ __swab32s(&h->hal_version);
+ __swab32s(&h->hal_count);
+ __swab32s(&h->hal_archive_id);
+ __swab64s(&h->hal_flags);
+ hai = hai_zero(h);
+ for (i = 0; i < h->hal_count; i++, hai = hai_next(hai))
+ lustre_swab_hai(hai);
+}
+
+static void lustre_swab_kuch(struct kuc_hdr *l)
+{
+ __swab16s(&l->kuc_magic);
+ /* __u8 l->kuc_transport */
+ __swab16s(&l->kuc_msgtype);
+ __swab16s(&l->kuc_msglen);
+}
+
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+ struct lustre_kernelcomm *lk)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+ __u32 archive = lk->lk_data;
+ int rc = 0;
+
+ if (lk->lk_group != KUC_GRP_HSM) {
+ CERROR("Bad copytool group %d\n", lk->lk_group);
+ return -EINVAL;
+ }
+
+ CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd,
+ lk->lk_uid, lk->lk_group, lk->lk_flags);
+
+ if (lk->lk_flags & LK_FLG_STOP) {
+ /* Unregister with the coordinator */
+ rc = mdc_ioc_hsm_ct_unregister(imp);
+ } else {
+ rc = mdc_ioc_hsm_ct_register(imp, archive);
+ }
+
+ return rc;
+}
+
+/**
+ * Send a message to any listening copytools
+ * @param val KUC message (kuc_hdr + hsm_action_list)
+ * @param len total length of message
+ */
+static int mdc_hsm_copytool_send(int len, void *val)
+{
+ struct kuc_hdr *lh = (struct kuc_hdr *)val;
+ struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1);
+ int rc;
+
+ if (len < sizeof(*lh) + sizeof(*hal)) {
+ CERROR("Short HSM message %d < %d\n", len,
+ (int) (sizeof(*lh) + sizeof(*hal)));
+ return -EPROTO;
+ }
+ if (lh->kuc_magic == __swab16(KUC_MAGIC)) {
+ lustre_swab_kuch(lh);
+ lustre_swab_hal(hal);
+ } else if (lh->kuc_magic != KUC_MAGIC) {
+ CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC);
+ return -EPROTO;
+ }
+
+ CDEBUG(D_HSM,
+ "Received message mg=%x t=%d m=%d l=%d actions=%d on %s\n",
+ lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype,
+ lh->kuc_msglen, hal->hal_count, hal->hal_fsname);
+
+ /* Broadcast to HSM listeners */
+ rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+
+ return rc;
+}
+
+/**
+ * callback function passed to kuc for re-registering each HSM copytool
+ * running on MDC, after MDT shutdown/recovery.
+ * @param data archive id served by the copytool
+ * @param cb_arg callback argument (obd_import)
+ */
+static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg)
+{
+ struct obd_import *imp = (struct obd_import *)cb_arg;
+ __u32 archive = data;
+ int rc;
+
+ CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n",
+ archive);
+ rc = mdc_ioc_hsm_ct_register(imp, archive);
+
+ /* ignore error if the copytool is already registered */
+ return ((rc != 0) && (rc != -EEXIST)) ? rc : 0;
+}
+
+/**
+ * Re-establish all kuc contexts with MDT
+ * after MDT shutdown/recovery.
+ */
+static int mdc_kuc_reregister(struct obd_import *imp)
+{
+ /* re-register HSM agents */
+ return libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister,
+ (void *)imp);
+}
+
+static int mdc_set_info_async(const struct lu_env *env,
+ struct obd_export *exp,
+ u32 keylen, void *key,
+ u32 vallen, void *val,
+ struct ptlrpc_request_set *set)
+{
+ struct obd_import *imp = class_exp2cliimp(exp);
+ int rc;
+
+ if (KEY_IS(KEY_READ_ONLY)) {
+ if (vallen != sizeof(int))
+ return -EINVAL;
+
+ spin_lock(&imp->imp_lock);
+ if (*((int *)val)) {
+ imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY;
+ imp->imp_connect_data.ocd_connect_flags |=
+ OBD_CONNECT_RDONLY;
+ } else {
+ imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY;
+ imp->imp_connect_data.ocd_connect_flags &=
+ ~OBD_CONNECT_RDONLY;
+ }
+ spin_unlock(&imp->imp_lock);
+
+ rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+ keylen, key, vallen, val, set);
+ return rc;
+ }
+ if (KEY_IS(KEY_SPTLRPC_CONF)) {
+ sptlrpc_conf_client_adapt(exp->exp_obd);
+ return 0;
+ }
+ if (KEY_IS(KEY_FLUSH_CTX)) {
+ sptlrpc_import_flush_my_ctx(imp);
+ return 0;
+ }
+ if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
+ rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+ keylen, key, vallen, val, set);
+ return rc;
+ }
+ if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) {
+ rc = mdc_hsm_copytool_send(vallen, val);
+ return rc;
+ }
+
+ CERROR("Unknown key %s\n", (char *)key);
+ return -EINVAL;
+}
+
+static int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
+ __u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm)
+{
+ int rc = -EINVAL;
+
+ if (KEY_IS(KEY_MAX_EASIZE)) {
+ int mdsize, *max_easize;
+
+ if (*vallen != sizeof(int))
+ return -EINVAL;
+ mdsize = *(int *)val;
+ if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize)
+ exp->exp_obd->u.cli.cl_max_mds_easize = mdsize;
+ max_easize = val;
+ *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize;
+ return 0;
+ } else if (KEY_IS(KEY_DEFAULT_EASIZE)) {
+ int *default_easize;
+
+ if (*vallen != sizeof(int))
+ return -EINVAL;
+ default_easize = val;
+ *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize;
+ return 0;
+ } else if (KEY_IS(KEY_MAX_COOKIESIZE)) {
+ int mdsize, *max_cookiesize;
+
+ if (*vallen != sizeof(int))
+ return -EINVAL;
+ mdsize = *(int *)val;
+ if (mdsize > exp->exp_obd->u.cli.cl_max_mds_cookiesize)
+ exp->exp_obd->u.cli.cl_max_mds_cookiesize = mdsize;
+ max_cookiesize = val;
+ *max_cookiesize = exp->exp_obd->u.cli.cl_max_mds_cookiesize;
+ return 0;
+ } else if (KEY_IS(KEY_DEFAULT_COOKIESIZE)) {
+ int *default_cookiesize;
+
+ if (*vallen != sizeof(int))
+ return -EINVAL;
+ default_cookiesize = val;
+ *default_cookiesize =
+ exp->exp_obd->u.cli.cl_default_mds_cookiesize;
+ return 0;
+ } else if (KEY_IS(KEY_CONN_DATA)) {
+ struct obd_import *imp = class_exp2cliimp(exp);
+ struct obd_connect_data *data = val;
+
+ if (*vallen != sizeof(*data))
+ return -EINVAL;
+
+ *data = imp->imp_connect_data;
+ return 0;
+ } else if (KEY_IS(KEY_TGT_COUNT)) {
+ *((int *)val) = 1;
+ return 0;
+ }
+
+ rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val);
+
+ return rc;
+}
+
+static int mdc_sync(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ *request = NULL;
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, fid, oc, 0, 0, -1, 0);
+
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ ptlrpc_req_finished(req);
+ else
+ *request = req;
+ return rc;
+}
+
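+/* Descriptive note on the function below: react to import state changes by
+ * flushing the FID sequence when the import goes inactive, cleaning up the
+ * DLM namespace on invalidation, and re-registering HSM copytools once the
+ * import is active again. */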
+static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
+ enum obd_import_event event)
+{
+ int rc = 0;
+
+ LASSERT(imp->imp_obd == obd);
+
+ switch (event) {
+ case IMP_EVENT_DISCON: {
+#if 0
+ /* XXX Pass event up to OBDs stack. used only for FLD now */
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL);
+#endif
+ break;
+ }
+ case IMP_EVENT_INACTIVE: {
+ struct client_obd *cli = &obd->u.cli;
+ /*
+ * Flush current sequence to make client obtain new one
+ * from server in case of disconnect/reconnect.
+ */
+ if (cli->cl_seq != NULL)
+ seq_client_flush(cli->cl_seq);
+
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+ break;
+ }
+ case IMP_EVENT_INVALIDATE: {
+ struct ldlm_namespace *ns = obd->obd_namespace;
+
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+
+ break;
+ }
+ case IMP_EVENT_ACTIVE:
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+ /* redo the kuc registration after reconnecting */
+ if (rc == 0)
+ rc = mdc_kuc_reregister(imp);
+ break;
+ case IMP_EVENT_OCD:
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+ break;
+ case IMP_EVENT_DEACTIVATE:
+ case IMP_EVENT_ACTIVATE:
+ break;
+ default:
+ CERROR("Unknown import event %x\n", event);
+ LBUG();
+ }
+ return rc;
+}
+
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+ struct md_op_data *op_data)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ struct lu_client_seq *seq = cli->cl_seq;
+
+ return seq_client_alloc_fid(NULL, seq, fid);
+}
+
+static struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+
+ return &cli->cl_target_uuid;
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying it during
+ * recovery. Return a non-zero value if the lock can be canceled,
+ * zero otherwise.
+ */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+ if (lock->l_resource->lr_type != LDLM_IBITS)
+ return 0;
+
+	/* FIXME: if we ever get into a situation where there are too many
+	 * open files with open locks on a single node, then we really
+	 * should replay these open locks to reacquire them */
+ if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+ return 0;
+
+ return 1;
+}
+
+static int mdc_resource_inode_free(struct ldlm_resource *res)
+{
+ if (res->lr_lvb_inode)
+ res->lr_lvb_inode = NULL;
+
+ return 0;
+}
+
+static struct ldlm_valblock_ops inode_lvbo = {
+ .lvbo_free = mdc_resource_inode_free,
+};
+
+static int mdc_llog_init(struct obd_device *obd)
+{
+ struct obd_llog_group *olg = &obd->obd_olg;
+ struct llog_ctxt *ctxt;
+ int rc;
+
+ rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd,
+ &llog_client_ops);
+ if (rc)
+ return rc;
+
+ ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
+ llog_initiator_connect(ctxt);
+ llog_ctxt_put(ctxt);
+
+ return 0;
+}
+
+static void mdc_llog_finish(struct obd_device *obd)
+{
+ struct llog_ctxt *ctxt;
+
+ ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
+ if (ctxt)
+ llog_cleanup(NULL, ctxt);
+}
+
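+/* Descriptive note on the function below: set up the MDC obd device by
+ * allocating the RPC and close locks, registering lprocfs entries, installing
+ * the LVB operations and the recovery cancel policy, and initializing the
+ * changelog llog context. */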
+static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+ struct client_obd *cli = &obd->u.cli;
+ struct lprocfs_static_vars lvars = { NULL };
+ int rc;
+
+ OBD_ALLOC(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock));
+ if (!cli->cl_rpc_lock)
+ return -ENOMEM;
+ mdc_init_rpc_lock(cli->cl_rpc_lock);
+
+ ptlrpcd_addref();
+
+ OBD_ALLOC(cli->cl_close_lock, sizeof(*cli->cl_close_lock));
+ if (!cli->cl_close_lock) {
+ rc = -ENOMEM;
+ goto err_rpc_lock;
+ }
+ mdc_init_rpc_lock(cli->cl_close_lock);
+
+ rc = client_obd_setup(obd, cfg);
+ if (rc)
+ goto err_close_lock;
+ lprocfs_mdc_init_vars(&lvars);
+ lprocfs_obd_setup(obd, lvars.obd_vars);
+ sptlrpc_lprocfs_cliobd_attach(obd);
+ ptlrpc_lprocfs_register_obd(obd);
+
+ ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
+ obd->obd_namespace->ns_lvbo = &inode_lvbo;
+
+ rc = mdc_llog_init(obd);
+ if (rc) {
+ mdc_cleanup(obd);
+ CERROR("failed to setup llogging subsystems\n");
+ }
+
+ return rc;
+
+err_close_lock:
+ OBD_FREE(cli->cl_close_lock, sizeof(*cli->cl_close_lock));
+err_rpc_lock:
+ OBD_FREE(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock));
+ ptlrpcd_decref();
+ return rc;
+}
+
+/* Initialize the default and maximum LOV EA and cookie sizes. This allows
+ * us to make MDS RPCs with large enough reply buffers to hold a default
+ * sized EA and cookie without having to calculate this (via a call into the
+ * LOV + OSCs) each time we make an RPC. The maximum size is also tracked
+ * but not used, to avoid wastefully vmalloc()'ing large reply buffers when
+ * a large number of stripes is possible. If a larger reply buffer is
+ * required it will be reallocated in the ptlrpc layer due to overflow.
+ */
+static int mdc_init_ea_size(struct obd_export *exp, int easize,
+ int def_easize, int cookiesize, int def_cookiesize)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct client_obd *cli = &obd->u.cli;
+
+ if (cli->cl_max_mds_easize < easize)
+ cli->cl_max_mds_easize = easize;
+
+ if (cli->cl_default_mds_easize < def_easize)
+ cli->cl_default_mds_easize = def_easize;
+
+ if (cli->cl_max_mds_cookiesize < cookiesize)
+ cli->cl_max_mds_cookiesize = cookiesize;
+
+ if (cli->cl_default_mds_cookiesize < def_cookiesize)
+ cli->cl_default_mds_cookiesize = def_cookiesize;
+
+ return 0;
+}
+
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+ switch (stage) {
+ case OBD_CLEANUP_EARLY:
+ break;
+ case OBD_CLEANUP_EXPORTS:
+ /* Failsafe, ok if racy */
+ if (obd->obd_type->typ_refcnt <= 1)
+ libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
+
+ obd_cleanup_client_import(obd);
+ ptlrpc_lprocfs_unregister_obd(obd);
+ lprocfs_obd_cleanup(obd);
+
+ mdc_llog_finish(obd);
+ break;
+ }
+ return 0;
+}
+
+static int mdc_cleanup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+
+ OBD_FREE(cli->cl_rpc_lock, sizeof(*cli->cl_rpc_lock));
+ OBD_FREE(cli->cl_close_lock, sizeof(*cli->cl_close_lock));
+
+ ptlrpcd_decref();
+
+ return client_obd_cleanup(obd);
+}
+
+static int mdc_process_config(struct obd_device *obd, u32 len, void *buf)
+{
+ struct lustre_cfg *lcfg = buf;
+ struct lprocfs_static_vars lvars = { NULL };
+ int rc = 0;
+
+ lprocfs_mdc_init_vars(&lvars);
+ switch (lcfg->lcfg_command) {
+ default:
+ rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars,
+ lcfg, obd);
+ if (rc > 0)
+ rc = 0;
+ break;
+ }
+ return rc;
+}
+
+
+/* get remote permission for current user on fid */
+static int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid,
+ struct obd_capa *oc, __u32 suppgid,
+ struct ptlrpc_request **request)
+{
+ struct ptlrpc_request *req;
+ int rc;
+
+ LASSERT(client_is_remote(exp));
+
+ *request = NULL;
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0);
+
+ req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+ sizeof(struct mdt_remote_perm));
+
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ ptlrpc_req_finished(req);
+ else
+ *request = req;
+ return rc;
+}
+
+static int mdc_interpret_renew_capa(const struct lu_env *env,
+ struct ptlrpc_request *req, void *args,
+ int status)
+{
+ struct mdc_renew_capa_args *ra = args;
+ struct mdt_body *body = NULL;
+ struct lustre_capa *capa;
+
+ if (status) {
+ capa = ERR_PTR(status);
+ goto out;
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ if (body == NULL) {
+ capa = ERR_PTR(-EFAULT);
+ goto out;
+ }
+
+ if ((body->valid & OBD_MD_FLOSSCAPA) == 0) {
+ capa = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2);
+ if (!capa) {
+ capa = ERR_PTR(-EFAULT);
+ goto out;
+ }
+out:
+ ra->ra_cb(ra->ra_oc, capa);
+ return 0;
+}
+
+static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+ renew_capa_cb_t cb)
+{
+ struct ptlrpc_request *req;
+ struct mdc_renew_capa_args *ra;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR,
+ LUSTRE_MDS_VERSION, MDS_GETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ /* NB, OBD_MD_FLOSSCAPA is set here, but it doesn't necessarily mean the
+ * capa to renew is oss capa.
+ */
+ mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0);
+ ptlrpc_request_set_replen(req);
+
+ CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args));
+ ra = ptlrpc_req_async_args(req);
+ ra->ra_oc = oc;
+ ra->ra_cb = cb;
+ req->rq_interpret_reply = mdc_interpret_renew_capa;
+ ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+ return 0;
+}
+
+static struct obd_ops mdc_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_setup = mdc_setup,
+ .o_precleanup = mdc_precleanup,
+ .o_cleanup = mdc_cleanup,
+ .o_add_conn = client_import_add_conn,
+ .o_del_conn = client_import_del_conn,
+ .o_connect = client_connect_import,
+ .o_disconnect = client_disconnect_export,
+ .o_iocontrol = mdc_iocontrol,
+ .o_set_info_async = mdc_set_info_async,
+ .o_statfs = mdc_statfs,
+ .o_fid_init = client_fid_init,
+ .o_fid_fini = client_fid_fini,
+ .o_fid_alloc = mdc_fid_alloc,
+ .o_import_event = mdc_import_event,
+ .o_get_info = mdc_get_info,
+ .o_process_config = mdc_process_config,
+ .o_get_uuid = mdc_get_uuid,
+ .o_quotactl = mdc_quotactl,
+ .o_quotacheck = mdc_quotacheck
+};
+
+static struct md_ops mdc_md_ops = {
+ .m_getstatus = mdc_getstatus,
+ .m_null_inode = mdc_null_inode,
+ .m_find_cbdata = mdc_find_cbdata,
+ .m_close = mdc_close,
+ .m_create = mdc_create,
+ .m_done_writing = mdc_done_writing,
+ .m_enqueue = mdc_enqueue,
+ .m_getattr = mdc_getattr,
+ .m_getattr_name = mdc_getattr_name,
+ .m_intent_lock = mdc_intent_lock,
+ .m_link = mdc_link,
+ .m_is_subdir = mdc_is_subdir,
+ .m_rename = mdc_rename,
+ .m_setattr = mdc_setattr,
+ .m_setxattr = mdc_setxattr,
+ .m_getxattr = mdc_getxattr,
+ .m_sync = mdc_sync,
+ .m_readpage = mdc_readpage,
+ .m_unlink = mdc_unlink,
+ .m_cancel_unused = mdc_cancel_unused,
+ .m_init_ea_size = mdc_init_ea_size,
+ .m_set_lock_data = mdc_set_lock_data,
+ .m_lock_match = mdc_lock_match,
+ .m_get_lustre_md = mdc_get_lustre_md,
+ .m_free_lustre_md = mdc_free_lustre_md,
+ .m_set_open_replay_data = mdc_set_open_replay_data,
+ .m_clear_open_replay_data = mdc_clear_open_replay_data,
+ .m_renew_capa = mdc_renew_capa,
+ .m_unpack_capa = mdc_unpack_capa,
+ .m_get_remote_perm = mdc_get_remote_perm,
+ .m_intent_getattr_async = mdc_intent_getattr_async,
+ .m_revalidate_lock = mdc_revalidate_lock
+};
+
+static int __init mdc_init(void)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+
+ lprocfs_mdc_init_vars(&lvars);
+
+ return class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars,
+ LUSTRE_MDC_NAME, NULL);
+}
+
+static void /*__exit*/ mdc_exit(void)
+{
+ class_unregister_type(LUSTRE_MDC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Metadata Client");
+MODULE_LICENSE("GPL");
+
+module_init(mdc_init);
+module_exit(mdc_exit);
diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile
new file mode 100644
index 000000000..cc6e9f51a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LUSTRE_FS) += mgc.o
+mgc-y := mgc_request.o
+mgc-$(CONFIG_PROC_FS) += lproc_mgc.o
diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
new file mode 100644
index 000000000..c4ea38e5f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
@@ -0,0 +1,80 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/vfs.h>
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "mgc_internal.h"
+
+LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, import);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, state);
+
+LPROC_SEQ_FOPS_WR_ONLY(mgc, ping);
+
+static int mgc_ir_state_seq_show(struct seq_file *m, void *v)
+{
+ return lprocfs_mgc_rd_ir_state(m, m->private);
+}
+LPROC_SEQ_FOPS_RO(mgc_ir_state);
+
+static struct lprocfs_vars lprocfs_mgc_obd_vars[] = {
+ { "uuid", &mgc_uuid_fops, NULL, 0 },
+ { "ping", &mgc_ping_fops, NULL, 0222 },
+ { "connect_flags", &mgc_connect_flags_fops, NULL, 0 },
+ { "mgs_server_uuid", &mgc_server_uuid_fops, NULL, 0 },
+ { "mgs_conn_uuid", &mgc_conn_uuid_fops, NULL, 0 },
+ { "import", &mgc_import_fops, NULL, 0 },
+ { "state", &mgc_state_fops, NULL, 0 },
+ { "ir_state", &mgc_ir_state_fops, NULL, 0 },
+ { NULL }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(mgc, numrefs);
+static struct lprocfs_vars lprocfs_mgc_module_vars[] = {
+ { "num_refs", &mgc_numrefs_fops, NULL, 0 },
+ { NULL }
+};
+
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = lprocfs_mgc_module_vars;
+ lvars->obd_vars = lprocfs_mgc_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
new file mode 100644
index 000000000..a6f8b3ced
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
@@ -0,0 +1,73 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MGC_INTERNAL_H
+#define _MGC_INTERNAL_H
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_log.h"
+#include "../include/lustre_export.h"
+
+#if defined(CONFIG_PROC_FS)
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars);
+int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data);
+#else
+static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+static inline int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld);
+
+static inline int cld_is_sptlrpc(struct config_llog_data *cld)
+{
+ return cld->cld_type == CONFIG_T_SPTLRPC;
+}
+
+static inline int cld_is_recover(struct config_llog_data *cld)
+{
+ return cld->cld_type == CONFIG_T_RECOVER;
+}
+
+#endif /* _MGC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c
new file mode 100644
index 000000000..7947aec5c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c
@@ -0,0 +1,1762 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+#include <linux/module.h>
+#include "../include/obd_class.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_log.h"
+#include "../include/lustre_disk.h"
+
+#include "mgc_internal.h"
+
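+/* Descriptive note on the function below: build an LDLM resource id from a
+ * config log name; the (at most 8-byte) name is packed into name[0], while
+ * name[1] encodes the log type so that recovery and params logs get resource
+ * ids distinct from the config log. */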
+static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id,
+ int type)
+{
+ __u64 resname = 0;
+
+ if (len > sizeof(resname)) {
+ CERROR("name too long: %s\n", name);
+ return -EINVAL;
+ }
+ if (len <= 0) {
+ CERROR("missing name: %s\n", name);
+ return -EINVAL;
+ }
+ memcpy(&resname, name, len);
+
+ /* Always use the same endianness for the resid */
+ memset(res_id, 0, sizeof(*res_id));
+ res_id->name[0] = cpu_to_le64(resname);
+	/* XXX: unfortunately, sptlrpc and config llog share one lock */
+ switch (type) {
+ case CONFIG_T_CONFIG:
+ case CONFIG_T_SPTLRPC:
+ resname = 0;
+ break;
+ case CONFIG_T_RECOVER:
+ case CONFIG_T_PARAMS:
+ resname = type;
+ break;
+ default:
+ LBUG();
+ }
+ res_id->name[1] = cpu_to_le64(resname);
+ CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name,
+ res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+ return 0;
+}
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type)
+{
+	/* fsname is at most 8 chars long and may contain "-",
+	 * e.g. "lustre", "SUN-000" */
+ return mgc_name2resid(fsname, strlen(fsname), res_id, type);
+}
+EXPORT_SYMBOL(mgc_fsname2resid);
+
+static int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type)
+{
+ char *name_end;
+ int len;
+
+	/* logname consists of "fsname-nodetype",
+	 * e.g. "lustre-MDT0001", "SUN-000-client";
+	 * there is one exception: the llog "params" */
+ name_end = strrchr(logname, '-');
+ if (!name_end)
+ len = strlen(logname);
+ else
+ len = name_end - logname;
+ return mgc_name2resid(logname, len, res_id, type);
+}
+
+/********************** config llog list **********************/
+static LIST_HEAD(config_llog_list);
+static DEFINE_SPINLOCK(config_list_lock);
+
+/* Take a reference to a config log */
+static int config_log_get(struct config_llog_data *cld)
+{
+ atomic_inc(&cld->cld_refcount);
+ CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+ atomic_read(&cld->cld_refcount));
+ return 0;
+}
+
+/* Drop a reference to a config log. When no longer referenced,
+ we can free the config log data */
+static void config_log_put(struct config_llog_data *cld)
+{
+ CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+ atomic_read(&cld->cld_refcount));
+ LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+	/* take the spinlock to make sure no item with a 0 refcount stays in the list */
+ if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) {
+ list_del(&cld->cld_list_chain);
+ spin_unlock(&config_list_lock);
+
+ CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+
+ if (cld->cld_recover)
+ config_log_put(cld->cld_recover);
+ if (cld->cld_sptlrpc)
+ config_log_put(cld->cld_sptlrpc);
+ if (cld->cld_params)
+ config_log_put(cld->cld_params);
+ if (cld_is_sptlrpc(cld))
+ sptlrpc_conf_log_stop(cld->cld_logname);
+
+ class_export_put(cld->cld_mgcexp);
+ OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
+ }
+}
+
+/* Find a config log by name */
+static
+struct config_llog_data *config_log_find(char *logname,
+ struct config_llog_instance *cfg)
+{
+ struct config_llog_data *cld;
+ struct config_llog_data *found = NULL;
+ void *instance;
+
+ LASSERT(logname != NULL);
+
+ instance = cfg ? cfg->cfg_instance : NULL;
+ spin_lock(&config_list_lock);
+ list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+		/* check if the instance matches */
+ if (instance != cld->cld_cfg.cfg_instance)
+ continue;
+
+ /* instance may be NULL, should check name */
+ if (strcmp(logname, cld->cld_logname) == 0) {
+ found = cld;
+ break;
+ }
+ }
+ if (found) {
+ atomic_inc(&found->cld_refcount);
+ LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0);
+ }
+ spin_unlock(&config_list_lock);
+ return found;
+}
+
+static
+struct config_llog_data *do_config_log_add(struct obd_device *obd,
+ char *logname,
+ int type,
+ struct config_llog_instance *cfg,
+ struct super_block *sb)
+{
+ struct config_llog_data *cld;
+ int rc;
+
+ CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
+ cfg ? cfg->cfg_instance : NULL);
+
+ OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
+ if (!cld)
+ return ERR_PTR(-ENOMEM);
+
+ strcpy(cld->cld_logname, logname);
+ if (cfg)
+ cld->cld_cfg = *cfg;
+ else
+ cld->cld_cfg.cfg_callback = class_config_llog_handler;
+ mutex_init(&cld->cld_lock);
+ cld->cld_cfg.cfg_last_idx = 0;
+ cld->cld_cfg.cfg_flags = 0;
+ cld->cld_cfg.cfg_sb = sb;
+ cld->cld_type = type;
+ atomic_set(&cld->cld_refcount, 1);
+
+ /* Keep the mgc around until we are done */
+ cld->cld_mgcexp = class_export_get(obd->obd_self_export);
+
+ if (cld_is_sptlrpc(cld)) {
+ sptlrpc_conf_log_start(logname);
+ cld->cld_cfg.cfg_obdname = obd->obd_name;
+ }
+
+ rc = mgc_logname2resid(logname, &cld->cld_resid, type);
+
+ spin_lock(&config_list_lock);
+ list_add(&cld->cld_list_chain, &config_llog_list);
+ spin_unlock(&config_list_lock);
+
+ if (rc) {
+ config_log_put(cld);
+ return ERR_PTR(rc);
+ }
+
+ if (cld_is_sptlrpc(cld)) {
+ rc = mgc_process_log(obd, cld);
+ if (rc && rc != -ENOENT)
+ CERROR("failed processing sptlrpc log: %d\n", rc);
+ }
+
+ return cld;
+}
+
+static struct config_llog_data *config_recover_log_add(struct obd_device *obd,
+ char *fsname,
+ struct config_llog_instance *cfg,
+ struct super_block *sb)
+{
+ struct config_llog_instance lcfg = *cfg;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct config_llog_data *cld;
+ char logname[32];
+
+ if (IS_OST(lsi))
+ return NULL;
+
+ /* for osp-on-ost, see lustre_start_osp() */
+ if (IS_MDT(lsi) && lcfg.cfg_instance)
+ return NULL;
+
+ /* we have to use different llogs for clients and MDTs for cmd,
+ * where only clients are notified if one of the cmd servers restarts */
+ LASSERT(strlen(fsname) < sizeof(logname) / 2);
+ strcpy(logname, fsname);
+ if (IS_SERVER(lsi)) { /* mdt */
+ LASSERT(lcfg.cfg_instance == NULL);
+ lcfg.cfg_instance = sb;
+ strcat(logname, "-mdtir");
+ } else {
+ LASSERT(lcfg.cfg_instance != NULL);
+ strcat(logname, "-cliir");
+ }
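+ /* the resulting logname is e.g. "lustre-mdtir" on an MDT or "lustre-cliir" on a client */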
+
+ cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb);
+ return cld;
+}
+
+static struct config_llog_data *config_params_log_add(struct obd_device *obd,
+ struct config_llog_instance *cfg, struct super_block *sb)
+{
+ struct config_llog_instance lcfg = *cfg;
+ struct config_llog_data *cld;
+
+ lcfg.cfg_instance = sb;
+
+ cld = do_config_log_add(obd, PARAMS_FILENAME, CONFIG_T_PARAMS,
+ &lcfg, sb);
+
+ return cld;
+}
+
+/** Add this log to the list of active logs watched by an MGC.
+ * Active means we're watching for updates.
+ * We have one active log per "mount" - client instance or servername.
+ * Each instance may be at a different point in the log.
+ */
+static int config_log_add(struct obd_device *obd, char *logname,
+ struct config_llog_instance *cfg,
+ struct super_block *sb)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct config_llog_data *cld;
+ struct config_llog_data *sptlrpc_cld;
+ struct config_llog_data *params_cld;
+ char seclogname[32];
+ char *ptr;
+ int rc;
+
+ CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
+
+ /*
+ * for each regular log, the corresponding sptlrpc log name is
+ * <fsname>-sptlrpc. Multiple regular logs may share one sptlrpc log.
+ */
+ ptr = strrchr(logname, '-');
+ if (ptr == NULL || ptr - logname > 8) {
+ CERROR("logname %s is too long\n", logname);
+ return -EINVAL;
+ }
+
+ memcpy(seclogname, logname, ptr - logname);
+ strcpy(seclogname + (ptr - logname), "-sptlrpc");
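+ /* e.g. logname "lustre-client" yields seclogname "lustre-sptlrpc" */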
+
+ sptlrpc_cld = config_log_find(seclogname, NULL);
+ if (sptlrpc_cld == NULL) {
+ sptlrpc_cld = do_config_log_add(obd, seclogname,
+ CONFIG_T_SPTLRPC, NULL, NULL);
+ if (IS_ERR(sptlrpc_cld)) {
+ CERROR("can't create sptlrpc log: %s\n", seclogname);
+ rc = PTR_ERR(sptlrpc_cld);
+ goto out_err;
+ }
+ }
+ params_cld = config_params_log_add(obd, cfg, sb);
+ if (IS_ERR(params_cld)) {
+ rc = PTR_ERR(params_cld);
+ CERROR("%s: can't create params log: rc = %d\n",
+ obd->obd_name, rc);
+ goto out_err1;
+ }
+
+ cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb);
+ if (IS_ERR(cld)) {
+ CERROR("can't create log: %s\n", logname);
+ rc = PTR_ERR(cld);
+ goto out_err2;
+ }
+
+ cld->cld_sptlrpc = sptlrpc_cld;
+ cld->cld_params = params_cld;
+
+ LASSERT(lsi->lsi_lmd);
+ if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
+ struct config_llog_data *recover_cld;
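+
+ /* strip the "-sptlrpc" suffix to recover the plain fsname */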
+ *strrchr(seclogname, '-') = 0;
+ recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
+ if (IS_ERR(recover_cld)) {
+ rc = PTR_ERR(recover_cld);
+ goto out_err3;
+ }
+ cld->cld_recover = recover_cld;
+ }
+
+ return 0;
+
+out_err3:
+ config_log_put(cld);
+
+out_err2:
+ config_log_put(params_cld);
+
+out_err1:
+ config_log_put(sptlrpc_cld);
+
+out_err:
+ return rc;
+}
+
+DEFINE_MUTEX(llog_process_lock);
+
+/** Stop watching for updates on this log.
+ */
+static int config_log_end(char *logname, struct config_llog_instance *cfg)
+{
+ struct config_llog_data *cld;
+ struct config_llog_data *cld_sptlrpc = NULL;
+ struct config_llog_data *cld_params = NULL;
+ struct config_llog_data *cld_recover = NULL;
+ int rc = 0;
+
+ cld = config_log_find(logname, cfg);
+ if (cld == NULL)
+ return -ENOENT;
+
+ mutex_lock(&cld->cld_lock);
+ /*
+ * if cld_stopping is set, we never started the log and thus do
+ * not own the start ref. This can happen after a previous umount:
+ * the cld is still hanging around waiting for a lock cancel, and
+ * we remount but fail part way through, calling log_end without
+ * ever having called start_log.
+ */
+ if (unlikely(cld->cld_stopping)) {
+ mutex_unlock(&cld->cld_lock);
+ /* drop the ref from the find */
+ config_log_put(cld);
+ return rc;
+ }
+
+ cld->cld_stopping = 1;
+
+ cld_recover = cld->cld_recover;
+ cld->cld_recover = NULL;
+ mutex_unlock(&cld->cld_lock);
+
+ if (cld_recover) {
+ mutex_lock(&cld_recover->cld_lock);
+ cld_recover->cld_stopping = 1;
+ mutex_unlock(&cld_recover->cld_lock);
+ config_log_put(cld_recover);
+ }
+
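+ /* detach the sptlrpc and params logs under config_list_lock, then drop them */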
+ spin_lock(&config_list_lock);
+ cld_sptlrpc = cld->cld_sptlrpc;
+ cld->cld_sptlrpc = NULL;
+ cld_params = cld->cld_params;
+ cld->cld_params = NULL;
+ spin_unlock(&config_list_lock);
+
+ if (cld_sptlrpc)
+ config_log_put(cld_sptlrpc);
+
+ if (cld_params) {
+ mutex_lock(&cld_params->cld_lock);
+ cld_params->cld_stopping = 1;
+ mutex_unlock(&cld_params->cld_lock);
+ config_log_put(cld_params);
+ }
+
+ /* drop the ref from the find */
+ config_log_put(cld);
+ /* drop the start ref */
+ config_log_put(cld);
+
+ CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+ rc);
+ return rc;
+}
+
+#if defined(CONFIG_PROC_FS)
+int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_import *imp;
+ struct obd_connect_data *ocd;
+ struct config_llog_data *cld;
+
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+ ocd = &imp->imp_connect_data;
+
+ seq_printf(m, "imperative_recovery: %s\n",
+ OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED");
+ seq_printf(m, "client_state:\n");
+
+ spin_lock(&config_list_lock);
+ list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+ if (cld->cld_recover == NULL)
+ continue;
+ seq_printf(m, " - { client: %s, nidtbl_version: %u }\n",
+ cld->cld_logname,
+ cld->cld_recover->cld_cfg.cfg_last_idx);
+ }
+ spin_unlock(&config_list_lock);
+
+ LPROCFS_CLIMP_EXIT(obd);
+ return 0;
+}
+#endif
+
+/* reenqueue any lost locks */
+#define RQ_RUNNING 0x1
+#define RQ_NOW 0x2
+#define RQ_LATER 0x4
+#define RQ_STOP 0x8
+#define RQ_PRECLEANUP 0x10
+static int rq_state;
+static wait_queue_head_t rq_waitq;
+static DECLARE_COMPLETION(rq_exit);
+static DECLARE_COMPLETION(rq_start);
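+/* rq_start signals that the requeue thread is running, rq_exit that it has exited */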
+
+static void do_requeue(struct config_llog_data *cld)
+{
+ LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+ /* Do not run mgc_process_log on a disconnected export or an
+ export which is being disconnected. Take the client
+ semaphore to make the check non-racy. */
+ down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+ if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) {
+ CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
+ mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+ } else {
+ CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
+ cld->cld_logname);
+ }
+ up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+}
+
+/* this timeout represents how many seconds the MGC should wait before
+ * requeueing the config and recovery locks to the MGS. We randomize it so
+ * that clients do not all flood the MGS at once; the resulting delay is
+ * MGC_TIMEOUT_MIN_SECONDS plus up to ~5s of random jitter.
+ */
+#define MGC_TIMEOUT_MIN_SECONDS 5
+#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* 511 centiseconds, ~5s */
+
+static int mgc_requeue_thread(void *data)
+{
+ bool first = true;
+
+ CDEBUG(D_MGC, "Starting requeue thread\n");
+
+ /* Keep trying failed locks periodically */
+ spin_lock(&config_list_lock);
+ rq_state |= RQ_RUNNING;
+ while (1) {
+ struct l_wait_info lwi;
+ struct config_llog_data *cld, *cld_prev;
+ int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC;
+ int stopped = !!(rq_state & RQ_STOP);
+ int to;
+
+ /* Any new or requeued lostlocks will change the state */
+ rq_state &= ~(RQ_NOW | RQ_LATER);
+ spin_unlock(&config_list_lock);
+
+ if (first) {
+ first = false;
+ complete(&rq_start);
+ }
+
+ /* Always wait a few seconds to allow the server that
+ caused the lock revocation to finish its setup, plus some
+ random delay so everyone doesn't try to reconnect at once. */
+ to = MGC_TIMEOUT_MIN_SECONDS * HZ;
+ to += rand * HZ / 100; /* rand is centi-seconds */
+ lwi = LWI_TIMEOUT(to, NULL, NULL);
+ l_wait_event(rq_waitq, rq_state & (RQ_STOP | RQ_PRECLEANUP),
+ &lwi);
+
+ /*
+ * Iterate through the list. For each cld, process its
+ * dependent sptlrpc cld first (if any) and then the cld itself.
+ *
+ * Any item in the list is guaranteed to hold a
+ * reference > 0, and if cld_lostlock is set, at
+ * least one reference was taken by the previous enqueue.
+ */
+ cld_prev = NULL;
+
+ spin_lock(&config_list_lock);
+ rq_state &= ~RQ_PRECLEANUP;
+ list_for_each_entry(cld, &config_llog_list,
+ cld_list_chain) {
+ if (!cld->cld_lostlock)
+ continue;
+
+ spin_unlock(&config_list_lock);
+
+ LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+ /* Whether we enqueued again or not in mgc_process_log,
+ * we're done with the ref from the old enqueue */
+ if (cld_prev)
+ config_log_put(cld_prev);
+ cld_prev = cld;
+
+ cld->cld_lostlock = 0;
+ if (likely(!stopped))
+ do_requeue(cld);
+
+ spin_lock(&config_list_lock);
+ }
+ spin_unlock(&config_list_lock);
+ if (cld_prev)
+ config_log_put(cld_prev);
+
+ /* break only after scanning the list so that we can drop the
+ * refcounts of the clds that lost their locks */
+ if (unlikely(stopped)) {
+ spin_lock(&config_list_lock);
+ break;
+ }
+
+ /* Wait a bit to see if anyone else needs a requeue */
+ lwi = (struct l_wait_info) { 0 };
+ l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP),
+ &lwi);
+ spin_lock(&config_list_lock);
+ }
+ /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */
+ rq_state &= ~RQ_RUNNING;
+ spin_unlock(&config_list_lock);
+
+ complete(&rq_exit);
+
+ CDEBUG(D_MGC, "Ending requeue thread\n");
+ return 0;
+}
+
+/* Add a cld to the list to requeue. Start the requeue thread if needed.
+ We are responsible for dropping the config log reference from here on out. */
+static void mgc_requeue_add(struct config_llog_data *cld)
+{
+ CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n",
+ cld->cld_logname, atomic_read(&cld->cld_refcount),
+ cld->cld_stopping, rq_state);
+ LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+ mutex_lock(&cld->cld_lock);
+ if (cld->cld_stopping || cld->cld_lostlock) {
+ mutex_unlock(&cld->cld_lock);
+ return;
+ }
+ /* this refcount will be released in mgc_requeue_thread. */
+ config_log_get(cld);
+ cld->cld_lostlock = 1;
+ mutex_unlock(&cld->cld_lock);
+
+ /* Hold lock for rq_state */
+ spin_lock(&config_list_lock);
+ if (rq_state & RQ_STOP) {
+ spin_unlock(&config_list_lock);
+ cld->cld_lostlock = 0;
+ config_log_put(cld);
+ } else {
+ rq_state |= RQ_NOW;
+ spin_unlock(&config_list_lock);
+ wake_up(&rq_waitq);
+ }
+}
+
+static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd)
+{
+ struct llog_ctxt *ctxt;
+ int rc;
+
+ /* set up only the remote ctxt; the local disk context is switched for
+ * each filesystem during mgc_fs_setup() */
+ rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd,
+ &llog_client_ops);
+ if (rc)
+ return rc;
+
+ ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+ LASSERT(ctxt);
+
+ llog_initiator_connect(ctxt);
+ llog_ctxt_put(ctxt);
+
+ return 0;
+}
+
+static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd)
+{
+ struct llog_ctxt *ctxt;
+
+ ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+ if (ctxt)
+ llog_cleanup(env, ctxt);
+
+ return 0;
+}
+
+static atomic_t mgc_count = ATOMIC_INIT(0);
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+ int rc = 0;
+ int temp;
+
+ switch (stage) {
+ case OBD_CLEANUP_EARLY:
+ break;
+ case OBD_CLEANUP_EXPORTS:
+ if (atomic_dec_and_test(&mgc_count)) {
+ LASSERT(rq_state & RQ_RUNNING);
+ /* stop requeue thread */
+ temp = RQ_STOP;
+ } else {
+ /* wakeup requeue thread to clean our cld */
+ temp = RQ_NOW | RQ_PRECLEANUP;
+ }
+ spin_lock(&config_list_lock);
+ rq_state |= temp;
+ spin_unlock(&config_list_lock);
+ wake_up(&rq_waitq);
+ if (temp & RQ_STOP)
+ wait_for_completion(&rq_exit);
+ obd_cleanup_client_import(obd);
+ rc = mgc_llog_fini(NULL, obd);
+ if (rc != 0)
+ CERROR("failed to cleanup llogging subsystems\n");
+ break;
+ }
+ return rc;
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+ /* COMPAT_146 - old config logs may have added profiles we don't
+ know about */
+ if (obd->obd_type->typ_refcnt <= 1)
+ /* Only for the last mgc */
+ class_del_profiles();
+
+ lprocfs_obd_cleanup(obd);
+ ptlrpcd_decref();
+
+ return client_obd_cleanup(obd);
+}
+
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct lprocfs_static_vars lvars;
+ int rc;
+
+ ptlrpcd_addref();
+
+ rc = client_obd_setup(obd, lcfg);
+ if (rc)
+ goto err_decref;
+
+ rc = mgc_llog_init(NULL, obd);
+ if (rc) {
+ CERROR("failed to setup llogging subsystems\n");
+ goto err_cleanup;
+ }
+
+ lprocfs_mgc_init_vars(&lvars);
+ lprocfs_obd_setup(obd, lvars.obd_vars);
+ sptlrpc_lprocfs_cliobd_attach(obd);
+
+ if (atomic_inc_return(&mgc_count) == 1) {
+ rq_state = 0;
+ init_waitqueue_head(&rq_waitq);
+
+ /* start requeue thread */
+ rc = PTR_ERR(kthread_run(mgc_requeue_thread, NULL,
+ "ll_cfg_requeue"));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("%s: Cannot start requeue thread (%d),no more log updates!\n",
+ obd->obd_name, rc);
+ goto err_cleanup;
+ }
+ /* rc is the task_struct pointer of mgc_requeue_thread. */
+ rc = 0;
+ wait_for_completion(&rq_start);
+ }
+
+ return rc;
+
+err_cleanup:
+ client_obd_cleanup(obd);
+err_decref:
+ ptlrpcd_decref();
+ return rc;
+}
+
+/* based on ll_mdc_blocking_ast */
+static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+ void *data, int flag)
+{
+ struct lustre_handle lockh;
+ struct config_llog_data *cld = (struct config_llog_data *)data;
+ int rc = 0;
+
+ switch (flag) {
+ case LDLM_CB_BLOCKING:
+ /* mgs wants the lock, give it up... */
+ LDLM_DEBUG(lock, "MGC blocking CB");
+ ldlm_lock2handle(lock, &lockh);
+ rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+ break;
+ case LDLM_CB_CANCELING:
+ /* We've given up the lock, prepare ourselves to update. */
+ LDLM_DEBUG(lock, "MGC cancel CB");
+
+ CDEBUG(D_MGC, "Lock res "DLDLMRES" (%.8s)\n",
+ PLDLMRES(lock->l_resource),
+ (char *)&lock->l_resource->lr_name.name[0]);
+
+ if (!cld) {
+ CDEBUG(D_INFO, "missing data, won't requeue\n");
+ break;
+ }
+
+ /* held at mgc_process_log(). */
+ LASSERT(atomic_read(&cld->cld_refcount) > 0);
+ /* Are we done with this log? */
+ if (cld->cld_stopping) {
+ CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
+ cld->cld_logname);
+ config_log_put(cld);
+ break;
+ }
+ /* Make sure not to re-enqueue when the mgc is stopping
+ (we get called from client_disconnect_export) */
+ if (!lock->l_conn_export ||
+ !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+ CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n",
+ cld->cld_logname);
+ config_log_put(cld);
+ break;
+ }
+
+ /* Re-enqueue now */
+ mgc_requeue_add(cld);
+ config_log_put(cld);
+ break;
+ default:
+ LBUG();
+ }
+
+ return rc;
+}
+
+/* Not sure where this should go... */
+/* This is the timeout value for MGS_CONNECT request plus a ping interval, such
+ * that we can have a chance to try the secondary MGS if any. */
+#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 0 : at_min) \
+ + PING_INTERVAL)
+#define MGC_TARGET_REG_LIMIT 10
+#define MGC_SEND_PARAM_LIMIT 10
+
+/* Send parameter to MGS */
+static int mgc_set_mgs_param(struct obd_export *exp,
+ struct mgs_send_param *msp)
+{
+ struct ptlrpc_request *req;
+ struct mgs_send_param *req_msp, *rep_msp;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION,
+ MGS_SET_INFO);
+ if (!req)
+ return -ENOMEM;
+
+ req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+ if (!req_msp) {
+ ptlrpc_req_finished(req);
+ return -ENOMEM;
+ }
+
+ memcpy(req_msp, msp, sizeof(*req_msp));
+ ptlrpc_request_set_replen(req);
+
+ /* Limit how long we will wait for the request to complete */
+ req->rq_delay_limit = MGC_SEND_PARAM_LIMIT;
+ rc = ptlrpc_queue_wait(req);
+ if (!rc) {
+ rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+ memcpy(msp, rep_msp, sizeof(*rep_msp));
+ }
+
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
+/* Take a config lock so we can get cancel notifications */
+static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
+ __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+ __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb,
+ void *data, __u32 lvb_len, void *lvb_swabber,
+ struct lustre_handle *lockh)
+{
+ struct config_llog_data *cld = (struct config_llog_data *)data;
+ struct ldlm_enqueue_info einfo = {
+ .ei_type = type,
+ .ei_mode = mode,
+ .ei_cb_bl = mgc_blocking_ast,
+ .ei_cb_cp = ldlm_completion_ast,
+ };
+ struct ptlrpc_request *req;
+ int short_limit = cld_is_sptlrpc(cld);
+ int rc;
+
+ CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname,
+ cld->cld_resid.name[0]);
+
+ /* We need a callback for every lockholder, so don't try to
+ ldlm_lock_match (see rev 1.1.2.11.2.47) */
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0);
+ ptlrpc_request_set_replen(req);
+
+ /* check if this is server or client */
+ if (cld->cld_cfg.cfg_sb) {
+ struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb);
+ if (lsi && IS_SERVER(lsi))
+ short_limit = 1;
+ }
+ /* Limit how long we will wait for the enqueue to complete */
+ req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT;
+ rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags,
+ NULL, 0, LVB_T_NONE, lockh, 0);
+ /* A failed enqueue should still call the mgc_blocking_ast,
+ where it will be requeued if needed ("grant failed"). */
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static void mgc_notify_active(struct obd_device *unused)
+{
+ /* wakeup mgc_requeue_thread to requeue mgc lock */
+ spin_lock(&config_list_lock);
+ rq_state |= RQ_NOW;
+ spin_unlock(&config_list_lock);
+ wake_up(&rq_waitq);
+
+ /* TODO: Help the MGS rebuild nidtbl. -jay */
+}
+
+/* Send target_reg message to MGS */
+static int mgc_target_register(struct obd_export *exp,
+ struct mgs_target_info *mti)
+{
+ struct ptlrpc_request *req;
+ struct mgs_target_info *req_mti, *rep_mti;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION,
+ MGS_TARGET_REG);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);
+ if (!req_mti) {
+ ptlrpc_req_finished(req);
+ return -ENOMEM;
+ }
+
+ memcpy(req_mti, mti, sizeof(*req_mti));
+ ptlrpc_request_set_replen(req);
+ CDEBUG(D_MGC, "register %s\n", mti->mti_svname);
+ /* Limit how long we will wait for the request to complete */
+ req->rq_delay_limit = MGC_TARGET_REG_LIMIT;
+
+ rc = ptlrpc_queue_wait(req);
+ if (!rc) {
+ rep_mti = req_capsule_server_get(&req->rq_pill,
+ &RMF_MGS_TARGET_INFO);
+ memcpy(mti, rep_mti, sizeof(*rep_mti));
+ CDEBUG(D_MGC, "register %s got index = %d\n",
+ mti->mti_svname, mti->mti_stripe_index);
+ }
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
+static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen,
+ void *val, struct ptlrpc_request_set *set)
+{
+ int rc = -EINVAL;
+
+ /* Turn off initial_recov after we try all backup servers once */
+ if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
+ struct obd_import *imp = class_exp2cliimp(exp);
+ int value;
+ if (vallen != sizeof(int))
+ return -EINVAL;
+ value = *(int *)val;
+ CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n",
+ imp->imp_obd->obd_name, value,
+ imp->imp_deactive, imp->imp_invalid,
+ imp->imp_replayable, imp->imp_obd->obd_replayable,
+ ptlrpc_import_state_name(imp->imp_state));
+ /* Resurrect if we previously died */
+ if ((imp->imp_state != LUSTRE_IMP_FULL &&
+ imp->imp_state != LUSTRE_IMP_NEW) || value > 1)
+ ptlrpc_reconnect_import(imp);
+ return 0;
+ }
+ if (KEY_IS(KEY_SET_INFO)) {
+ struct mgs_send_param *msp;
+
+ msp = (struct mgs_send_param *)val;
+ rc = mgc_set_mgs_param(exp, msp);
+ return rc;
+ }
+ if (KEY_IS(KEY_MGSSEC)) {
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ struct sptlrpc_flavor flvr;
+
+ /*
+ * An empty string means "use the current flavor"; if no flavor has
+ * been set yet, set it to "null".
+ *
+ * If a flavor has been set previously, the requested flavor must
+ * match the existing one.
+ */
+ if (vallen == 0) {
+ if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID)
+ return 0;
+ val = "null";
+ vallen = 4;
+ }
+
+ rc = sptlrpc_parse_flavor(val, &flvr);
+ if (rc) {
+ CERROR("invalid sptlrpc flavor %s to MGS\n",
+ (char *) val);
+ return rc;
+ }
+
+ /*
+ * the caller already holds a mutex
+ */
+ if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) {
+ cli->cl_flvr_mgc = flvr;
+ } else if (memcmp(&cli->cl_flvr_mgc, &flvr,
+ sizeof(flvr)) != 0) {
+ char str[20];
+
+ sptlrpc_flavor2name(&cli->cl_flvr_mgc,
+ str, sizeof(str));
+ LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but currently %s is in use\n",
+ (char *) val, str);
+ rc = -EPERM;
+ }
+ return rc;
+ }
+
+ return rc;
+}
+
+static int mgc_get_info(const struct lu_env *env, struct obd_export *exp,
+ __u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *unused)
+{
+ int rc = -EINVAL;
+
+ if (KEY_IS(KEY_CONN_DATA)) {
+ struct obd_import *imp = class_exp2cliimp(exp);
+ struct obd_connect_data *data = val;
+
+ if (*vallen == sizeof(*data)) {
+ *data = imp->imp_connect_data;
+ rc = 0;
+ }
+ }
+
+ return rc;
+}
+
+static int mgc_import_event(struct obd_device *obd,
+ struct obd_import *imp,
+ enum obd_import_event event)
+{
+ LASSERT(imp->imp_obd == obd);
+ CDEBUG(D_MGC, "import event %#x\n", event);
+
+ switch (event) {
+ case IMP_EVENT_DISCON:
+ /* MGC imports should not wait for recovery */
+ if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+ ptlrpc_pinger_ir_down();
+ break;
+ case IMP_EVENT_INACTIVE:
+ break;
+ case IMP_EVENT_INVALIDATE: {
+ struct ldlm_namespace *ns = obd->obd_namespace;
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+ break;
+ }
+ case IMP_EVENT_ACTIVE:
+ CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name);
+ /* Clearing obd_no_recov allows us to continue pinging */
+ obd->obd_no_recov = 0;
+ mgc_notify_active(obd);
+ if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+ ptlrpc_pinger_ir_up();
+ break;
+ case IMP_EVENT_OCD:
+ break;
+ case IMP_EVENT_DEACTIVATE:
+ case IMP_EVENT_ACTIVATE:
+ break;
+ default:
+ CERROR("Unknown import event %#x\n", event);
+ LBUG();
+ }
+ return 0;
+}
+
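+/* the initial fetch reads up to 1MiB of recovery log data at once; later
+ * fetches read small increments (CONFIG_READ_NRPAGES = 4 pages) */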
+enum {
+ CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT),
+ CONFIG_READ_NRPAGES = 4
+};
+
+static int mgc_apply_recover_logs(struct obd_device *mgc,
+ struct config_llog_data *cld,
+ __u64 max_version,
+ void *data, int datalen, bool mne_swab)
+{
+ struct config_llog_instance *cfg = &cld->cld_cfg;
+ struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb);
+ struct mgs_nidtbl_entry *entry;
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs bufs;
+ u64 prev_version = 0;
+ char *inst;
+ char *buf;
+ int bufsz;
+ int pos;
+ int rc = 0;
+ int off = 0;
+
+ LASSERT(cfg->cfg_instance != NULL);
+ LASSERT(cfg->cfg_sb == cfg->cfg_instance);
+
+ OBD_ALLOC(inst, PAGE_CACHE_SIZE);
+ if (inst == NULL)
+ return -ENOMEM;
+
+ if (!IS_SERVER(lsi)) {
+ pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance);
+ if (pos >= PAGE_CACHE_SIZE) {
+ OBD_FREE(inst, PAGE_CACHE_SIZE);
+ return -E2BIG;
+ }
+ } else {
+ LASSERT(IS_MDT(lsi));
+ rc = server_name2svname(lsi->lsi_svname, inst, NULL,
+ PAGE_CACHE_SIZE);
+ if (rc) {
+ OBD_FREE(inst, PAGE_CACHE_SIZE);
+ return -EINVAL;
+ }
+ pos = strlen(inst);
+ }
+
+ ++pos;
+ buf = inst + pos;
+ bufsz = PAGE_CACHE_SIZE - pos;
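+ /* the rest of the page, past the NUL-terminated instance string, is
+ * scratch space for the obdname and params strings */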
+
+ while (datalen > 0) {
+ int entry_len = sizeof(*entry);
+ int is_ost;
+ struct obd_device *obd;
+ char *obdname;
+ char *cname;
+ char *params;
+ char *uuid;
+
+ rc = -EINVAL;
+ if (datalen < sizeof(*entry))
+ break;
+
+ entry = (typeof(entry))(data + off);
+
+ /* sanity check */
+ if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */
+ break;
+ if (entry->mne_nid_count == 0) /* at least one nid entry */
+ break;
+ if (entry->mne_nid_size != sizeof(lnet_nid_t))
+ break;
+
+ entry_len += entry->mne_nid_count * entry->mne_nid_size;
+ if (datalen < entry_len) /* must have entry_len at least */
+ break;
+
+ /* Keep this swab for normal mixed endian handling. LU-1644 */
+ if (mne_swab)
+ lustre_swab_mgs_nidtbl_entry(entry);
+ if (entry->mne_length > PAGE_CACHE_SIZE) {
+ CERROR("MNE too large (%u)\n", entry->mne_length);
+ break;
+ }
+
+ if (entry->mne_length < entry_len)
+ break;
+
+ off += entry->mne_length;
+ datalen -= entry->mne_length;
+ if (datalen < 0)
+ break;
+
+ if (entry->mne_version > max_version) {
+ CERROR("entry index(%lld) is over max_index(%lld)\n",
+ entry->mne_version, max_version);
+ break;
+ }
+
+ if (prev_version >= entry->mne_version) {
+ CERROR("index unsorted, prev %lld, now %lld\n",
+ prev_version, entry->mne_version);
+ break;
+ }
+ prev_version = entry->mne_version;
+
+ /*
+ * Write a string with format "nid::instance" to
+ * lustre/<osc|mdc>/<target>-<osc|mdc>-<instance>/import.
+ */
+
+ is_ost = entry->mne_type == LDD_F_SV_TYPE_OST;
+ memset(buf, 0, bufsz);
+ obdname = buf;
+ pos = 0;
+
+ /* lustre-OST0001-osc-<instance #> */
+ strcpy(obdname, cld->cld_logname);
+ cname = strrchr(obdname, '-');
+ if (cname == NULL) {
+ CERROR("mgc %s: invalid logname %s\n",
+ mgc->obd_name, obdname);
+ break;
+ }
+
+ pos = cname - obdname;
+ obdname[pos] = 0;
+ pos += sprintf(obdname + pos, "-%s%04x",
+ is_ost ? "OST" : "MDT", entry->mne_index);
+
+ cname = is_ost ? "osc" : "mdc",
+ pos += sprintf(obdname + pos, "-%s-%s", cname, inst);
+ lustre_cfg_bufs_reset(&bufs, obdname);
+
+ /* find the obd by obdname */
+ obd = class_name2obd(obdname);
+ if (obd == NULL) {
+ CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n",
+ mgc->obd_name, obdname);
+ rc = 0;
+ /* this is a safe race while the OST is starting up */
+ continue;
+ }
+
+ /* osc.import = "connection=<Conn UUID>::<target instance>" */
+ ++pos;
+ params = buf + pos;
+ pos += sprintf(params, "%s.import=%s", cname, "connection=");
+ uuid = buf + pos;
+
+ down_read(&obd->u.cli.cl_sem);
+ if (obd->u.cli.cl_import == NULL) {
+ /* client does not connect to the OST yet */
+ up_read(&obd->u.cli.cl_sem);
+ rc = 0;
+ continue;
+ }
+
+ /* TODO: iterate all nids to find one */
+ /* find uuid by nid */
+ rc = client_import_find_conn(obd->u.cli.cl_import,
+ entry->u.nids[0],
+ (struct obd_uuid *)uuid);
+ up_read(&obd->u.cli.cl_sem);
+ if (rc < 0) {
+ CERROR("mgc: cannot find uuid by nid %s\n",
+ libcfs_nid2str(entry->u.nids[0]));
+ break;
+ }
+
+ CDEBUG(D_INFO, "Find uuid %s by nid %s\n",
+ uuid, libcfs_nid2str(entry->u.nids[0]));
+
+ pos += strlen(uuid);
+ pos += sprintf(buf + pos, "::%u", entry->mne_instance);
+ LASSERT(pos < bufsz);
+
+ lustre_cfg_bufs_set_string(&bufs, 1, params);
+
+ rc = -ENOMEM;
+ lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+ if (lcfg == NULL) {
+ CERROR("mgc: cannot allocate memory\n");
+ break;
+ }
+
+ CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n",
+ prev_version, max_version, obdname, params);
+
+ rc = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
+ if (rc)
+ CDEBUG(D_INFO, "process config for %s error %d\n",
+ obdname, rc);
+
+ /* continue, even one with error */
+ }
+
+ OBD_FREE(inst, PAGE_CACHE_SIZE);
+ return rc;
+}
+
+/**
+ * This function is called when this client is notified of a target restart
+ * by the MGS. A CONFIG_READ RPC is sent to fetch the recovery logs.
+ */
+static int mgc_process_recover_log(struct obd_device *obd,
+ struct config_llog_data *cld)
+{
+ struct ptlrpc_request *req = NULL;
+ struct config_llog_instance *cfg = &cld->cld_cfg;
+ struct mgs_config_body *body;
+ struct mgs_config_res *res;
+ struct ptlrpc_bulk_desc *desc;
+ struct page **pages;
+ int nrpages;
+ bool eof = true;
+ bool mne_swab = false;
+ int i;
+ int ealen;
+ int rc;
+
+ /* allocate buffers for the bulk transfer.
+ * If this is the first time logs are read from this MGS,
+ * CONFIG_READ_NRPAGES_INIT is used, since all logs are read at
+ * once; otherwise only an increment of the logs is read, which
+ * should be small, so CONFIG_READ_NRPAGES is used.
+ */
+ nrpages = CONFIG_READ_NRPAGES;
+ if (cfg->cfg_last_idx == 0) /* the first time */
+ nrpages = CONFIG_READ_NRPAGES_INIT;
+
+ OBD_ALLOC(pages, sizeof(*pages) * nrpages);
+ if (pages == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < nrpages; i++) {
+ pages[i] = alloc_page(GFP_IOFS);
+ if (pages[i] == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+again:
+ LASSERT(cld_is_recover(cld));
+ LASSERT(mutex_is_locked(&cld->cld_lock));
+ req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
+ &RQF_MGS_CONFIG_READ);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
+ if (rc)
+ goto out;
+
+ /* pack request */
+ body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
+ LASSERT(body != NULL);
+ LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
+ if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name))
+ >= sizeof(body->mcb_name)) {
+ rc = -E2BIG;
+ goto out;
+ }
+ body->mcb_offset = cfg->cfg_last_idx + 1;
+ body->mcb_type = cld->cld_type;
+ body->mcb_bits = PAGE_CACHE_SHIFT;
+ body->mcb_units = nrpages;
+
+ /* allocate bulk transfer descriptor */
+ desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK,
+ MGS_BULK_PORTAL);
+ if (desc == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < nrpages; i++)
+ ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
+ if (res->mcr_size < res->mcr_offset) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* always update the index, even if there were errors while
+ * handling the recovery logs */
+ cfg->cfg_last_idx = res->mcr_offset;
+ eof = res->mcr_offset == res->mcr_size;
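+ /* offset < size means more records remain on the MGS, so we loop
+ * ("goto again" below) until eof */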
+
+ CDEBUG(D_INFO, "Latest version %lld, more %d.\n",
+ res->mcr_offset, eof == false);
+
+ ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
+ if (ealen < 0) {
+ rc = ealen;
+ goto out;
+ }
+
+ if (ealen > nrpages << PAGE_CACHE_SHIFT) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (ealen == 0) { /* no logs transferred */
+ if (!eof)
+ rc = -EINVAL;
+ goto out;
+ }
+
+ mne_swab = !!ptlrpc_rep_need_swab(req);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+ /* This import flag means the server did an extra swab of IR MNE
+ * records (fixed in LU-1252), reverse it here if needed. LU-1644 */
+ if (unlikely(req->rq_import->imp_need_mne_swab))
+ mne_swab = !mne_swab;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+ for (i = 0; i < nrpages && ealen > 0; i++) {
+ int rc2;
+ void *ptr;
+
+ ptr = kmap(pages[i]);
+ rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
+ min_t(int, ealen, PAGE_CACHE_SIZE),
+ mne_swab);
+ kunmap(pages[i]);
+ if (rc2 < 0) {
+ CWARN("Process recover log %s error %d\n",
+ cld->cld_logname, rc2);
+ break;
+ }
+
+ ealen -= PAGE_CACHE_SIZE;
+ }
+
+out:
+ if (req)
+ ptlrpc_req_finished(req);
+
+ if (rc == 0 && !eof)
+ goto again;
+
+ if (pages) {
+ for (i = 0; i < nrpages; i++) {
+ if (pages[i] == NULL)
+ break;
+ __free_page(pages[i]);
+ }
+ OBD_FREE(pages, sizeof(*pages) * nrpages);
+ }
+ return rc;
+}
+
+/* local_only means it cannot get remote llogs */
+static int mgc_process_cfg_log(struct obd_device *mgc,
+ struct config_llog_data *cld, int local_only)
+{
+ struct llog_ctxt *ctxt;
+ struct lustre_sb_info *lsi = NULL;
+ int rc = 0;
+ bool sptlrpc_started = false;
+ struct lu_env *env;
+
+ LASSERT(cld);
+ LASSERT(mutex_is_locked(&cld->cld_lock));
+
+ /*
+ * the local copy of the sptlrpc log is controlled elsewhere,
+ * don't try to read it here.
+ */
+ if (cld_is_sptlrpc(cld) && local_only)
+ return 0;
+
+ if (cld->cld_cfg.cfg_sb)
+ lsi = s2lsi(cld->cld_cfg.cfg_sb);
+
+ OBD_ALLOC_PTR(env);
+ if (env == NULL)
+ return -ENOMEM;
+
+ rc = lu_env_init(env, LCT_MG_THREAD);
+ if (rc)
+ goto out_free;
+
+ ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
+ LASSERT(ctxt);
+
+ if (local_only) { /* no local log at client side */
+ rc = -EIO;
+ goto out_pop;
+ }
+
+ if (cld_is_sptlrpc(cld)) {
+ sptlrpc_conf_log_update_begin(cld->cld_logname);
+ sptlrpc_started = true;
+ }
+
+ /* logname and instance info should be the same, so use our
+ * copy of the instance for the update. The cfg_last_idx will
+ * be updated here. */
+ rc = class_config_parse_llog(env, ctxt, cld->cld_logname,
+ &cld->cld_cfg);
+
+out_pop:
+ __llog_ctxt_put(env, ctxt);
+
+ /*
+ * update settings on existing OBDs. This is done inside
+ * llog_process_lock so that no device is attaching/detaching
+ * in parallel.
+ * The logname must be <fsname>-sptlrpc.
+ */
+ if (sptlrpc_started) {
+ LASSERT(cld_is_sptlrpc(cld));
+ sptlrpc_conf_log_update_end(cld->cld_logname);
+ class_notify_sptlrpc_conf(cld->cld_logname,
+ strlen(cld->cld_logname) -
+ strlen("-sptlrpc"));
+ }
+
+ lu_env_fini(env);
+out_free:
+ OBD_FREE_PTR(env);
+ return rc;
+}
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.
+ * Copy the log locally before parsing it if appropriate (non-MGS server)
+ */
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
+{
+ struct lustre_handle lockh = { 0 };
+ __u64 flags = LDLM_FL_NO_LRU;
+ int rc = 0, rcl;
+
+ LASSERT(cld);
+
+ /* I don't want multiple processes running process_log at once --
+ sounds like badness. It actually might be fine, as long as
+ we're not trying to update from the same log
+ simultaneously (in which case we should use a per-log sem.) */
+ mutex_lock(&cld->cld_lock);
+ if (cld->cld_stopping) {
+ mutex_unlock(&cld->cld_lock);
+ return 0;
+ }
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
+
+ CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
+ cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+ /* Get the cfg lock on the llog */
+ rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL,
+ LCK_CR, &flags, NULL, NULL, NULL,
+ cld, 0, NULL, &lockh);
+ if (rcl == 0) {
+ /* Get the cld, it will be released in mgc_blocking_ast. */
+ config_log_get(cld);
+ rc = ldlm_lock_set_data(&lockh, (void *)cld);
+ LASSERT(rc == 0);
+ } else {
+ CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+
+ /* mark cld_lostlock so that it will requeue
+ * after MGC becomes available. */
+ cld->cld_lostlock = 1;
+ /* Get extra reference, it will be put in requeue thread */
+ config_log_get(cld);
+ }
+
+
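+ /* recovery logs are fetched via CONFIG_READ RPCs; all other logs are
+ * parsed through the llog layer */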
+ if (cld_is_recover(cld)) {
+ rc = 0; /* this is not a fatal error for recover log */
+ if (rcl == 0)
+ rc = mgc_process_recover_log(mgc, cld);
+ } else {
+ rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
+ }
+
+ CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
+ mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
+
+ mutex_unlock(&cld->cld_lock);
+
+ /* Now drop the lock so MGS can revoke it */
+ if (!rcl)
+ ldlm_lock_decref(&lockh, LCK_CR);
+
+ return rc;
+}
+
+
+/** Called from lustre_process_log.
+ * LCFG_LOG_START gets the config log from the MGS, processes it to start
+ * any services, and adds it to the list of logs to watch (follow).
+ */
+static int mgc_process_config(struct obd_device *obd, u32 len, void *buf)
+{
+ struct lustre_cfg *lcfg = buf;
+ struct config_llog_instance *cfg = NULL;
+ char *logname;
+ int rc = 0;
+
+ switch (lcfg->lcfg_command) {
+ case LCFG_LOV_ADD_OBD: {
+ /* Overloading this cfg command: register a new target */
+ struct mgs_target_info *mti;
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) !=
+ sizeof(struct mgs_target_info)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1);
+ CDEBUG(D_MGC, "add_target %s %#x\n",
+ mti->mti_svname, mti->mti_flags);
+ rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti);
+ break;
+ }
+ case LCFG_LOV_DEL_OBD:
+ /* Unregister has no meaning at the moment. */
+ CERROR("lov_del_obd unimplemented\n");
+ rc = -ENOSYS;
+ break;
+ case LCFG_SPTLRPC_CONF: {
+ rc = sptlrpc_process_config(lcfg);
+ break;
+ }
+ case LCFG_LOG_START: {
+ struct config_llog_data *cld;
+ struct super_block *sb;
+
+ logname = lustre_cfg_string(lcfg, 1);
+ cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2);
+ sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3);
+
+ CDEBUG(D_MGC, "parse_log %s from %d\n", logname,
+ cfg->cfg_last_idx);
+
+ /* We're only called through here on the initial mount */
+ rc = config_log_add(obd, logname, cfg, sb);
+ if (rc)
+ break;
+ cld = config_log_find(logname, cfg);
+ if (cld == NULL) {
+ rc = -ENOENT;
+ break;
+ }
+
+ /* COMPAT_146 */
+ /* FIXME only set this for old logs! Right now this forces
+ us to always skip the "inside markers" check */
+ cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146;
+
+ rc = mgc_process_log(obd, cld);
+ if (rc == 0 && cld->cld_recover != NULL) {
+ if (OCD_HAS_FLAG(&obd->u.cli.cl_import->
+ imp_connect_data, IMP_RECOV)) {
+ rc = mgc_process_log(obd, cld->cld_recover);
+ } else {
+ struct config_llog_data *cir = cld->cld_recover;
+ cld->cld_recover = NULL;
+ config_log_put(cir);
+ }
+ if (rc)
+ CERROR("Cannot process recover llog %d\n", rc);
+ }
+
+ if (rc == 0 && cld->cld_params != NULL) {
+ rc = mgc_process_log(obd, cld->cld_params);
+ if (rc == -ENOENT) {
+ CDEBUG(D_MGC,
+ "There is no params config file yet\n");
+ rc = 0;
+ }
+ /* params log is optional */
+ if (rc)
+ CERROR(
+ "%s: can't process params llog: rc = %d\n",
+ obd->obd_name, rc);
+ }
+ config_log_put(cld);
+
+ break;
+ }
+ case LCFG_LOG_END: {
+ logname = lustre_cfg_string(lcfg, 1);
+
+ if (lcfg->lcfg_bufcount >= 2)
+ cfg = (struct config_llog_instance *)lustre_cfg_buf(
+ lcfg, 2);
+ rc = config_log_end(logname, cfg);
+ break;
+ }
+ default: {
+ CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+ rc = -EINVAL;
+ goto out;
+
+ }
+ }
+out:
+ return rc;
+}
+
+struct obd_ops mgc_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_setup = mgc_setup,
+ .o_precleanup = mgc_precleanup,
+ .o_cleanup = mgc_cleanup,
+ .o_add_conn = client_import_add_conn,
+ .o_del_conn = client_import_del_conn,
+ .o_connect = client_connect_import,
+ .o_disconnect = client_disconnect_export,
+ /* .o_enqueue = mgc_enqueue, */
+ /* .o_iocontrol = mgc_iocontrol, */
+ .o_set_info_async = mgc_set_info_async,
+ .o_get_info = mgc_get_info,
+ .o_import_event = mgc_import_event,
+ .o_process_config = mgc_process_config,
+};
+
+static int __init mgc_init(void)
+{
+ return class_register_type(&mgc_obd_ops, NULL, NULL,
+ LUSTRE_MGC_NAME, NULL);
+}
+
+static void /*__exit*/ mgc_exit(void)
+{
+ class_unregister_type(LUSTRE_MGC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_LICENSE("GPL");
+
+module_init(mgc_init);
+module_exit(mgc_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
new file mode 100644
index 000000000..e89468179
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_LUSTRE_FS) += obdclass.o
+
+obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
+ llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \
+ genops.o uuid.o lprocfs_status.o \
+ lustre_handles.o lustre_peer.o \
+ statfs_pack.o obdo.o obd_config.o obd_mount.o \
+ lu_object.o dt_object.o capa.o cl_object.o \
+ cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o
+
+obdclass-$(CONFIG_PROC_FS) += lprocfs_counters.o
diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c
new file mode 100644
index 000000000..9a69f6b35
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/acl.c
@@ -0,0 +1,548 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/acl.c
+ *
+ * Lustre Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include "../include/lu_object.h"
+#include "../include/lustre_acl.h"
+#include "../include/lustre_eacl.h"
+#include "../include/obd_support.h"
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION
+
+enum {
+ ES_UNK = 0, /* unknown stat */
+ ES_UNC = 1, /* ACL entry is not changed */
+ ES_MOD = 2, /* ACL entry is modified */
+ ES_ADD = 3, /* ACL entry is added */
+ ES_DEL = 4 /* ACL entry is deleted */
+};
+
+static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d,
+ ext_acl_xattr_entry *s)
+{
+ d->e_tag = le16_to_cpu(s->e_tag);
+ d->e_perm = le16_to_cpu(s->e_perm);
+ d->e_id = le32_to_cpu(s->e_id);
+ d->e_stat = le32_to_cpu(s->e_stat);
+}
+
+static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d,
+ ext_acl_xattr_entry *s)
+{
+ d->e_tag = cpu_to_le16(s->e_tag);
+ d->e_perm = cpu_to_le16(s->e_perm);
+ d->e_id = cpu_to_le32(s->e_id);
+ d->e_stat = cpu_to_le32(s->e_stat);
+}
+
+static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d,
+ posix_acl_xattr_entry *s)
+{
+ d->e_tag = le16_to_cpu(s->e_tag);
+ d->e_perm = le16_to_cpu(s->e_perm);
+ d->e_id = le32_to_cpu(s->e_id);
+}
+
+static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d,
+ posix_acl_xattr_entry *s)
+{
+ d->e_tag = cpu_to_le16(s->e_tag);
+ d->e_perm = cpu_to_le16(s->e_perm);
+ d->e_id = cpu_to_le32(s->e_id);
+}
+
+
+/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */
+static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header,
+ int old_count, int new_count)
+{
+ int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr);
+ int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr);
+ posix_acl_xattr_header *new;
+
+ if (unlikely(old_count <= new_count))
+ return old_size;
+
+ OBD_ALLOC(new, new_size);
+ if (unlikely(new == NULL))
+ return -ENOMEM;
+
+ memcpy(new, *header, new_size);
+ OBD_FREE(*header, old_size);
+ *header = new;
+ return new_size;
+}
+
+/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. */
+static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header,
+ int old_count)
+{
+ int ext_count = le32_to_cpu((*header)->a_count);
+ int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+ int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr);
+ ext_acl_xattr_header *new;
+
+ if (unlikely(old_count <= ext_count))
+ return 0;
+
+ OBD_ALLOC(new, ext_size);
+ if (unlikely(new == NULL))
+ return -ENOMEM;
+
+ memcpy(new, *header, ext_size);
+ OBD_FREE(*header, old_size);
+ *header = new;
+ return 0;
+}
+
+/*
+ * Generate a new extended ACL based on the posix ACL.
+ */
+ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size)
+{
+ int count, i, esize;
+ ext_acl_xattr_header *new;
+
+ if (unlikely(size < 0))
+ return ERR_PTR(-EINVAL);
+ else if (!size)
+ count = 0;
+ else
+ count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+ esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+ OBD_ALLOC(new, esize);
+ if (unlikely(new == NULL))
+ return ERR_PTR(-ENOMEM);
+
+ new->a_count = cpu_to_le32(count);
+ for (i = 0; i < count; i++) {
+ new->a_entries[i].e_tag = header->a_entries[i].e_tag;
+ new->a_entries[i].e_perm = header->a_entries[i].e_perm;
+ new->a_entries[i].e_id = header->a_entries[i].e_id;
+ new->a_entries[i].e_stat = cpu_to_le32(ES_UNK);
+ }
+
+ return new;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext);
+
+/*
+ * Filter out the "nobody" entries in the posix ACL.
+ */
+int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, size_t size,
+ posix_acl_xattr_header **out)
+{
+ int count, i, j, rc = 0;
+ __u32 id;
+ posix_acl_xattr_header *new;
+
+ if (!size)
+ return 0;
+ if (size < sizeof(*new))
+ return -EINVAL;
+
+ OBD_ALLOC(new, size);
+ if (unlikely(new == NULL))
+ return -ENOMEM;
+
+ new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+ count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+ for (i = 0, j = 0; i < count; i++) {
+ id = le32_to_cpu(header->a_entries[i].e_id);
+ switch (le16_to_cpu(header->a_entries[i].e_tag)) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ if (id != ACL_UNDEFINED_ID) {
+ rc = -EIO;
+ goto _out;
+ }
+
+ memcpy(&new->a_entries[j++], &header->a_entries[i],
+ sizeof(posix_acl_xattr_entry));
+ break;
+ case ACL_USER:
+ if (id != NOBODY_UID)
+ memcpy(&new->a_entries[j++],
+ &header->a_entries[i],
+ sizeof(posix_acl_xattr_entry));
+ break;
+ case ACL_GROUP:
+ if (id != NOBODY_GID)
+ memcpy(&new->a_entries[j++],
+ &header->a_entries[i],
+ sizeof(posix_acl_xattr_entry));
+ break;
+ default:
+ rc = -EIO;
+ goto _out;
+ }
+ }
+
+ /* free unused space. */
+ rc = lustre_posix_acl_xattr_reduce_space(&new, count, j);
+ if (rc >= 0) {
+ size = rc;
+ *out = new;
+ rc = 0;
+ }
+
+_out:
+ if (rc) {
+ OBD_FREE(new, size);
+ size = rc;
+ }
+ return size;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_filter);
+
+/*
+ * Release the posix ACL space.
+ */
+void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size)
+{
+ OBD_FREE(header, size);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_free);
+
+/*
+ * Release the extended ACL space.
+ */
+void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header)
+{
+ OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \
+ ext_acl_xattr));
+}
+EXPORT_SYMBOL(lustre_ext_acl_xattr_free);
+
+static ext_acl_xattr_entry *
+lustre_ext_acl_xattr_search(ext_acl_xattr_header *header,
+ posix_acl_xattr_entry *entry, int *pos)
+{
+ int once, start, end, i, j, count = le32_to_cpu(header->a_count);
+
+ once = 0;
+ start = *pos;
+ end = count;
+
+again:
+ for (i = start; i < end; i++) {
+ if (header->a_entries[i].e_tag == entry->e_tag &&
+ header->a_entries[i].e_id == entry->e_id) {
+ j = i;
+ if (++i >= count)
+ i = 0;
+ *pos = i;
+ return &header->a_entries[j];
+ }
+ }
+
+ if (!once) {
+ once = 1;
+ start = 0;
+ end = *pos;
+ goto again;
+ }
+
+ return NULL;
+}
+
+/*
+ * Merge the posix ACL and the extended ACL into a new posix ACL.
+ */
+int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header,
+ posix_acl_xattr_header **out)
+{
+ int posix_count, posix_size, i, j;
+ int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0;
+ posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID};
+ posix_acl_xattr_header *new;
+ ext_acl_xattr_entry *ee, ae;
+
+ lustre_posix_acl_cpu_to_le(&pe, &pe);
+ ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos);
+ if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) {
+ /* there are only base ACL entries at most. */
+ posix_count = 3;
+ posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+ OBD_ALLOC(new, posix_size);
+ if (unlikely(new == NULL))
+ return -ENOMEM;
+
+ new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+ for (i = 0, j = 0; i < ext_count; i++) {
+ lustre_ext_acl_le_to_cpu(&ae,
+ &ext_header->a_entries[i]);
+ switch (ae.e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_OTHER:
+ if (ae.e_id != ACL_UNDEFINED_ID) {
+ rc = -EIO;
+ goto _out;
+ }
+
+ if (ae.e_stat != ES_DEL) {
+ new->a_entries[j].e_tag =
+ ext_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ ext_header->a_entries[i].e_perm;
+ new->a_entries[j++].e_id =
+ ext_header->a_entries[i].e_id;
+ }
+ break;
+ case ACL_MASK:
+ case ACL_USER:
+ case ACL_GROUP:
+ if (ae.e_stat == ES_DEL)
+ break;
+ default:
+ rc = -EIO;
+ goto _out;
+ }
+ }
+ } else {
+ /* there may be valid ACL_USER or ACL_GROUP entries in the
+ * original server-side ACL; they are regarded as ES_UNC state. */
+ int ori_posix_count;
+
+ if (unlikely(size < 0))
+ return -EINVAL;
+ else if (!size)
+ ori_posix_count = 0;
+ else
+ ori_posix_count =
+ CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+ posix_count = ori_posix_count + ext_count;
+ posix_size =
+ CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+ OBD_ALLOC(new, posix_size);
+ if (unlikely(new == NULL))
+ return -ENOMEM;
+
+ new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+ /* 1. process the unchanged ACL entries
+ * in the original server-side ACL. */
+ pos = 0;
+ for (i = 0, j = 0; i < ori_posix_count; i++) {
+ ee = lustre_ext_acl_xattr_search(ext_header,
+ &posix_header->a_entries[i], &pos);
+ if (ee == NULL)
+ memcpy(&new->a_entries[j++],
+ &posix_header->a_entries[i],
+ sizeof(posix_acl_xattr_entry));
+ }
+
+ /* 2. process the non-deleted entries
+ * from client-side extended ACL. */
+ for (i = 0; i < ext_count; i++) {
+ if (le16_to_cpu(ext_header->a_entries[i].e_stat) !=
+ ES_DEL) {
+ new->a_entries[j].e_tag =
+ ext_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ ext_header->a_entries[i].e_perm;
+ new->a_entries[j++].e_id =
+ ext_header->a_entries[i].e_id;
+ }
+ }
+ }
+
+ /* free unused space. */
+ rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j);
+ if (rc >= 0) {
+ posix_size = rc;
+ *out = new;
+ rc = 0;
+ }
+
+_out:
+ if (rc) {
+ OBD_FREE(new, posix_size);
+ posix_size = rc;
+ }
+ return posix_size;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2posix);
+
+/*
+ * Merge the posix ACL and the extended ACL into a new extended ACL.
+ */
+ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header)
+{
+ int ori_ext_count, posix_count, ext_count, ext_size;
+ int i, j, pos = 0, rc = 0;
+ posix_acl_xattr_entry pae;
+ ext_acl_xattr_header *new;
+ ext_acl_xattr_entry *ee, eae;
+
+ if (unlikely(size < 0))
+ return ERR_PTR(-EINVAL);
+ else if (!size)
+ posix_count = 0;
+ else
+ posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+ ori_ext_count = le32_to_cpu(ext_header->a_count);
+ ext_count = posix_count + ori_ext_count;
+ ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+
+ OBD_ALLOC(new, ext_size);
+ if (unlikely(new == NULL))
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0, j = 0; i < posix_count; i++) {
+ lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]);
+ switch (pae.e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ if (pae.e_id != ACL_UNDEFINED_ID) {
+ rc = -EIO;
+ goto out;
+ }
+ case ACL_USER:
+ /* ignore "nobody" entry. */
+ if (pae.e_id == NOBODY_UID)
+ break;
+
+ new->a_entries[j].e_tag =
+ posix_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ posix_header->a_entries[i].e_perm;
+ new->a_entries[j].e_id =
+ posix_header->a_entries[i].e_id;
+ ee = lustre_ext_acl_xattr_search(ext_header,
+ &posix_header->a_entries[i], &pos);
+ if (ee) {
+ if (posix_header->a_entries[i].e_perm !=
+ ee->e_perm)
+ /* entry modified. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_MOD);
+ else
+ /* entry unchanged. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_UNC);
+ } else {
+ /* new entry. */
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_ADD);
+ }
+ break;
+ case ACL_GROUP:
+ /* ignore "nobody" entry. */
+ if (pae.e_id == NOBODY_GID)
+ break;
+ new->a_entries[j].e_tag =
+ posix_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ posix_header->a_entries[i].e_perm;
+ new->a_entries[j].e_id =
+ posix_header->a_entries[i].e_id;
+ ee = lustre_ext_acl_xattr_search(ext_header,
+ &posix_header->a_entries[i], &pos);
+ if (ee) {
+ if (posix_header->a_entries[i].e_perm !=
+ ee->e_perm)
+ /* entry modified. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_MOD);
+ else
+ /* entry unchanged. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_UNC);
+ } else {
+ /* new entry. */
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_ADD);
+ }
+ break;
+ default:
+ rc = -EIO;
+ goto out;
+ }
+ }
+
+ /* process deleted entries. */
+ for (i = 0; i < ori_ext_count; i++) {
+ lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]);
+ if (eae.e_stat == ES_UNK) {
+ /* ignore "nobody" entry. */
+ if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) ||
+ (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID))
+ continue;
+
+ new->a_entries[j].e_tag =
+ ext_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ ext_header->a_entries[i].e_perm;
+ new->a_entries[j].e_id = ext_header->a_entries[i].e_id;
+ new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL);
+ }
+ }
+
+ new->a_count = cpu_to_le32(j);
+ /* free unused space. */
+ rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count);
+
+out:
+ if (rc) {
+ OBD_FREE(new, ext_size);
+ new = ERR_PTR(rc);
+ }
+ return new;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2ext);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c
new file mode 100644
index 000000000..d206b1046
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/capa.c
@@ -0,0 +1,421 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/capa.c
+ *
+ * Lustre Capability Hash Management
+ *
+ * Author: Lai Siyao<lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+
+#include "../include/obd_class.h"
+#include "../include/lustre_debug.h"
+#include "../include/lustre/lustre_idl.h"
+
+#include <linux/list.h>
+#include "../include/lustre_capa.h"
+
+#define NR_CAPAHASH 32
+#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */
+
+struct kmem_cache *capa_cachep = NULL;
+
+/* lock for capa hash/capa_list/fo_capa_keys */
+DEFINE_SPINLOCK(capa_lock);
+
+struct list_head capa_list[CAPA_SITE_MAX];
+
+static struct capa_hmac_alg capa_hmac_algs[] = {
+ DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20),
+};
+/* capa count */
+int capa_count[CAPA_SITE_MAX] = { 0, };
+
+EXPORT_SYMBOL(capa_cachep);
+EXPORT_SYMBOL(capa_list);
+EXPORT_SYMBOL(capa_lock);
+EXPORT_SYMBOL(capa_count);
+
+static inline
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+{
+ return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
+}
+
+struct hlist_head *init_capa_hash(void)
+{
+ struct hlist_head *hash;
+ int nr_hash, i;
+
+ OBD_ALLOC(hash, PAGE_CACHE_SIZE);
+ if (!hash)
+ return NULL;
+
+ nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head);
+ LASSERT(nr_hash > NR_CAPAHASH);
+
+ for (i = 0; i < NR_CAPAHASH; i++)
+ INIT_HLIST_HEAD(hash + i);
+ return hash;
+}
+EXPORT_SYMBOL(init_capa_hash);
+
+static inline int capa_on_server(struct obd_capa *ocapa)
+{
+ return ocapa->c_site == CAPA_SITE_SERVER;
+}
+
+static inline void capa_delete(struct obd_capa *ocapa)
+{
+ LASSERT(capa_on_server(ocapa));
+ hlist_del_init(&ocapa->u.tgt.c_hash);
+ list_del_init(&ocapa->c_list);
+ capa_count[ocapa->c_site]--;
+ /* release the reference taken at allocation */
+ capa_put(ocapa);
+}
+
+void cleanup_capa_hash(struct hlist_head *hash)
+{
+ int i;
+ struct hlist_node *next;
+ struct obd_capa *oc;
+
+ spin_lock(&capa_lock);
+ for (i = 0; i < NR_CAPAHASH; i++) {
+ hlist_for_each_entry_safe(oc, next, hash + i,
+ u.tgt.c_hash)
+ capa_delete(oc);
+ }
+ spin_unlock(&capa_lock);
+
+ OBD_FREE(hash, PAGE_CACHE_SIZE);
+}
+EXPORT_SYMBOL(cleanup_capa_hash);
+
+static inline int capa_hashfn(struct lu_fid *fid)
+{
+ return (fid_oid(fid) ^ fid_ver(fid)) *
+ (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH;
+}
+
+/* the capa renewal time check here is earlier than the one on the client,
+ * which prevents the client from renewing right after obtaining the capa. */
+static inline int capa_is_to_expire(struct obd_capa *oc)
+{
+ return time_before(cfs_time_sub(oc->c_expiry,
+ cfs_time_seconds(oc->c_capa.lc_timeout)*2/3),
+ cfs_time_current());
+}
+
+static struct obd_capa *find_capa(struct lustre_capa *capa,
+ struct hlist_head *head, int alive)
+{
+ struct obd_capa *ocapa;
+ int len = alive ? offsetof(struct lustre_capa, lc_keyid):sizeof(*capa);
+
+ hlist_for_each_entry(ocapa, head, u.tgt.c_hash) {
+ if (memcmp(&ocapa->c_capa, capa, len))
+ continue;
+ /* don't return one that will expire soon in this case */
+ if (alive && capa_is_to_expire(ocapa))
+ continue;
+
+ LASSERT(capa_on_server(ocapa));
+
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found");
+ return ocapa;
+ }
+
+ return NULL;
+}
+
+#define LRU_CAPA_DELETE_COUNT 12
+static inline void capa_delete_lru(struct list_head *head)
+{
+ struct obd_capa *ocapa;
+ struct list_head *node = head->next;
+ int count = 0;
+
+ /* free LRU_CAPA_DELETE_COUNT unused capa from head */
+ while (count++ < LRU_CAPA_DELETE_COUNT) {
+ ocapa = list_entry(node, struct obd_capa, c_list);
+ node = node->next;
+ if (atomic_read(&ocapa->c_refc))
+ continue;
+
+ DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru");
+ capa_delete(ocapa);
+ }
+}
+
+/* add or update */
+struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa)
+{
+ struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid);
+ struct obd_capa *ocapa, *old = NULL;
+ struct list_head *list = &capa_list[CAPA_SITE_SERVER];
+
+ ocapa = alloc_capa(CAPA_SITE_SERVER);
+ if (IS_ERR(ocapa))
+ return NULL;
+
+ spin_lock(&capa_lock);
+ old = find_capa(capa, head, 0);
+ if (!old) {
+ ocapa->c_capa = *capa;
+ set_capa_expiry(ocapa);
+ hlist_add_head(&ocapa->u.tgt.c_hash, head);
+ list_add_tail(&ocapa->c_list, list);
+ capa_get(ocapa);
+ capa_count[CAPA_SITE_SERVER]++;
+ if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE)
+ capa_delete_lru(list);
+ spin_unlock(&capa_lock);
+ return ocapa;
+ }
+ capa_get(old);
+ spin_unlock(&capa_lock);
+ capa_put(ocapa);
+ return old;
+}
+EXPORT_SYMBOL(capa_add);
+
+struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa,
+ int alive)
+{
+ struct obd_capa *ocapa;
+
+ spin_lock(&capa_lock);
+ ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive);
+ if (ocapa) {
+ list_move_tail(&ocapa->c_list,
+ &capa_list[CAPA_SITE_SERVER]);
+ capa_get(ocapa);
+ }
+ spin_unlock(&capa_lock);
+
+ return ocapa;
+}
+EXPORT_SYMBOL(capa_lookup);
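
As a hypothetical usage sketch (the helper below is illustrative only, not from
the Lustre tree), a server-side caller could combine the hash primitives above
as follows; "capa" stands for a capability taken from an incoming request:

static int capa_cache_sketch(struct lustre_capa *capa)
{
	struct hlist_head *hash;
	struct obd_capa *oc;

	hash = init_capa_hash();
	if (hash == NULL)
		return -ENOMEM;

	/* insert (or refresh) the capability; capa_add() returns a
	 * referenced obd_capa, so drop the reference when done */
	oc = capa_add(hash, capa);
	if (oc != NULL)
		capa_put(oc);

	/* look it up again, skipping entries that are about to expire */
	oc = capa_lookup(hash, capa, 1);
	if (oc != NULL)
		capa_put(oc);

	cleanup_capa_hash(hash);
	return 0;
}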
+
+static inline int ll_crypto_hmac(struct crypto_hash *tfm,
+ u8 *key, unsigned int *keylen,
+ struct scatterlist *sg,
+ unsigned int size, u8 *result)
+{
+ struct hash_desc desc;
+ int rv;
+ desc.tfm = tfm;
+ desc.flags = 0;
+ rv = crypto_hash_setkey(desc.tfm, key, *keylen);
+ if (rv) {
+ CERROR("failed to hash setkey: %d\n", rv);
+ return rv;
+ }
+ return crypto_hash_digest(&desc, sg, size, result);
+}
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key)
+{
+ struct crypto_hash *tfm;
+ struct capa_hmac_alg *alg;
+ int keylen;
+ struct scatterlist sl;
+
+ if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
+ CERROR("unknown capability hmac algorithm!\n");
+ return -EFAULT;
+ }
+
+ alg = &capa_hmac_algs[capa_alg(capa)];
+
+ tfm = crypto_alloc_hash(alg->ha_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ CERROR("crypto_alloc_tfm failed, check whether your kernel has crypto support!\n");
+ return PTR_ERR(tfm);
+ }
+ keylen = alg->ha_keylen;
+
+ sg_init_table(&sl, 1);
+ sg_set_page(&sl, virt_to_page(capa),
+ offsetof(struct lustre_capa, lc_hmac),
+ (unsigned long)(capa) % PAGE_CACHE_SIZE);
+
+ ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
+ crypto_free_hash(tfm);
+
+ return 0;
+}
+EXPORT_SYMBOL(capa_hmac);
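
A hypothetical verification sketch (illustrative only, not from the Lustre
tree): capa_hmac() signs everything up to the lc_hmac field, so a server can
check an incoming capability by recomputing the digest and comparing it with
the stored one:

static int capa_verify_sketch(struct lustre_capa *capa, __u8 *key)
{
	__u8 hmac[20];	/* SHA1 digest size, see capa_hmac_algs[] above */
	int rc;

	rc = capa_hmac(hmac, capa, key);
	if (rc)
		return rc;

	return memcmp(hmac, capa->lc_hmac, sizeof(hmac)) ? -EACCES : 0;
}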
+
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+ struct crypto_blkcipher *tfm;
+ struct scatterlist sd;
+ struct scatterlist ss;
+ struct blkcipher_desc desc;
+ unsigned int min;
+ int rc;
+ char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+
+ /* passing "aes" in a variable instead of a constant string keeps gcc
+ * 4.3.2 happy */
+ tfm = crypto_alloc_blkcipher(alg, 0, 0);
+ if (IS_ERR(tfm)) {
+ CERROR("failed to load transform for aes\n");
+ return PTR_ERR(tfm);
+ }
+
+ min = ll_crypto_tfm_alg_min_keysize(tfm);
+ if (keylen < min) {
+ CERROR("keylen at least %d bits for aes\n", min * 8);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = crypto_blkcipher_setkey(tfm, key, min);
+ if (rc) {
+ CERROR("failed to setting key for aes\n");
+ goto out;
+ }
+
+ sg_init_table(&sd, 1);
+ sg_set_page(&sd, virt_to_page(d), 16,
+ (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+ sg_init_table(&ss, 1);
+ sg_set_page(&ss, virt_to_page(s), 16,
+ (unsigned long)(s) % PAGE_CACHE_SIZE);
+ desc.tfm = tfm;
+ desc.info = NULL;
+ desc.flags = 0;
+ rc = crypto_blkcipher_encrypt(&desc, &sd, &ss, 16);
+ if (rc) {
+ CERROR("failed to encrypt for aes\n");
+ goto out;
+ }
+
+out:
+ crypto_free_blkcipher(tfm);
+ return rc;
+}
+EXPORT_SYMBOL(capa_encrypt_id);
+
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+ struct crypto_blkcipher *tfm;
+ struct scatterlist sd;
+ struct scatterlist ss;
+ struct blkcipher_desc desc;
+ unsigned int min;
+ int rc;
+ char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+
+ /* passing "aes" in a variable instead of a constant string keeps gcc
+ * 4.3.2 happy */
+ tfm = crypto_alloc_blkcipher(alg, 0, 0);
+ if (IS_ERR(tfm)) {
+ CERROR("failed to load transform for aes\n");
+ return PTR_ERR(tfm);
+ }
+
+ min = ll_crypto_tfm_alg_min_keysize(tfm);
+ if (keylen < min) {
+ CERROR("keylen at least %d bits for aes\n", min * 8);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = crypto_blkcipher_setkey(tfm, key, min);
+ if (rc) {
+ CERROR("failed to setting key for aes\n");
+ goto out;
+ }
+
+ sg_init_table(&sd, 1);
+ sg_set_page(&sd, virt_to_page(d), 16,
+ (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+ sg_init_table(&ss, 1);
+ sg_set_page(&ss, virt_to_page(s), 16,
+ (unsigned long)(s) % PAGE_CACHE_SIZE);
+
+ desc.tfm = tfm;
+ desc.info = NULL;
+ desc.flags = 0;
+ rc = crypto_blkcipher_decrypt(&desc, &sd, &ss, 16);
+ if (rc) {
+ CERROR("failed to decrypt for aes\n");
+ goto out;
+ }
+
+out:
+ crypto_free_blkcipher(tfm);
+ return rc;
+}
+EXPORT_SYMBOL(capa_decrypt_id);
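
An illustrative round-trip sketch (hypothetical helper, not from the Lustre
tree): both helpers operate on a single 16-byte block (4 x __u32), so an
encrypted id should decrypt back to the original with the same AES key:

static int capa_id_roundtrip_sketch(__u8 *key, int keylen)
{
	__u32 id[4] = { 1, 2, 3, 4 };
	__u32 enc[4], dec[4];
	int rc;

	rc = capa_encrypt_id(enc, id, key, keylen);
	if (rc == 0)
		rc = capa_decrypt_id(dec, enc, key, keylen);
	if (rc == 0 && memcmp(id, dec, sizeof(id)) != 0)
		rc = -EINVAL;	/* round trip failed */
	return rc;
}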
+
+void capa_cpy(void *capa, struct obd_capa *ocapa)
+{
+ spin_lock(&ocapa->c_lock);
+ *(struct lustre_capa *)capa = ocapa->c_capa;
+ spin_unlock(&ocapa->c_lock);
+}
+EXPORT_SYMBOL(capa_cpy);
+
+void _debug_capa(struct lustre_capa *c,
+ struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " capability@%p fid " DFID " opc %#llx uid %llu gid %llu flags %u alg %d keyid %u timeout %u expiry %u\n",
+ c, PFID(capa_fid(c)), capa_opc(c),
+ capa_uid(c), capa_gid(c), capa_flags(c),
+ capa_alg(c), capa_keyid(c), capa_timeout(c),
+ capa_expiry(c));
+ va_end(args);
+}
+EXPORT_SYMBOL(_debug_capa);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
new file mode 100644
index 000000000..7eb0ad7b3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
+#define CLT_PVEC_SIZE (14)
+
+/**
+ * Possible levels of the nesting. Currently this is 2: there are "top"
+ * entities (files, extent locks), and "sub" entities (stripes and stripe
+ * locks). This is used only for debugging counters right now.
+ */
+enum clt_nesting_level {
+ CNL_TOP,
+ CNL_SUB,
+ CNL_NR
+};
+
+/**
+ * Counters used to check correctness of cl_lock interface usage.
+ */
+struct cl_thread_counters {
+ /**
+ * Number of outstanding calls to cl_lock_mutex_get() made by the
+ * current thread. For debugging.
+ */
+ int ctc_nr_locks_locked;
+ /** List of locked locks. */
+ struct lu_ref ctc_locks_locked;
+ /** Number of outstanding holds on locks. */
+ int ctc_nr_held;
+ /** Number of outstanding uses on locks. */
+ int ctc_nr_used;
+ /** Number of held extent locks. */
+ int ctc_nr_locks_acquired;
+};
+
+/**
+ * Thread local state internal for generic cl-code.
+ */
+struct cl_thread_info {
+ /*
+ * Common fields.
+ */
+ struct cl_io clt_io;
+ struct cl_2queue clt_queue;
+
+ /*
+ * Fields used by cl_lock.c
+ */
+ struct cl_lock_descr clt_descr;
+ struct cl_page_list clt_list;
+ /**
+ * Counters for every level of lock nesting.
+ */
+ struct cl_thread_counters clt_counters[CNL_NR];
+ /** @} debugging */
+
+ /*
+ * Fields used by cl_page.c
+ */
+ struct cl_page *clt_pvec[CLT_PVEC_SIZE];
+
+ /*
+ * Fields used by cl_io.c
+ */
+ /**
+ * Pointer to the topmost ongoing IO in this thread.
+ */
+ struct cl_io *clt_current_io;
+ /**
+ * Used for submitting a sync io.
+ */
+ struct cl_sync_io clt_anchor;
+ /**
+ * Fields used by cl_lock_discard_pages().
+ */
+ pgoff_t clt_next_index;
+ pgoff_t clt_fn_index; /* first non-overlapped index */
+};
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c
new file mode 100644
index 000000000..3141b6043
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c
@@ -0,0 +1,1669 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include <linux/list.h>
+#include "../include/cl_object.h"
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
+#define cl_io_for_each(slice, io) \
+ list_for_each_entry((slice), &io->ci_layers, cis_linkage)
+#define cl_io_for_each_reverse(slice, io) \
+ list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage)
+
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+ return CIT_READ <= type && type < CIT_OP_NR;
+}
+
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+ return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC;
+}
+
+/**
+ * Returns true iff there is an IO ongoing in the given environment.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+ return cl_env_info(env)->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * cl_io invariant that holds at all times when exported cl_io_*() functions
+ * are entered and left.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+ struct cl_io *up;
+
+ up = io->ci_parent;
+ return
+ /*
+ * io can own pages only when it is ongoing. Sub-io might
+ * still be in CIS_LOCKED state when top-io is in
+ * CIS_IO_GOING.
+ */
+ ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING ||
+ (io->ci_state == CIS_LOCKED && up != NULL));
+}
+
+/**
+ * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top.
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+ struct cl_io_slice *slice;
+ struct cl_thread_info *info;
+
+ LINVRNT(cl_io_type_is_valid(io->ci_type));
+ LINVRNT(cl_io_invariant(io));
+
+ while (!list_empty(&io->ci_layers)) {
+ slice = container_of(io->ci_layers.prev, struct cl_io_slice,
+ cis_linkage);
+ list_del_init(&slice->cis_linkage);
+ if (slice->cis_iop->op[io->ci_type].cio_fini != NULL)
+ slice->cis_iop->op[io->ci_type].cio_fini(env, slice);
+ /*
+ * Invalidate slice to catch use after free. This assumes that
+ * slices are allocated within session and can be touched
+ * after ->cio_fini() returns.
+ */
+ slice->cis_io = NULL;
+ }
+ io->ci_state = CIS_FINI;
+ info = cl_env_info(env);
+ if (info->clt_current_io == io)
+ info->clt_current_io = NULL;
+
+ /* sanity check for layout change */
+ switch (io->ci_type) {
+ case CIT_READ:
+ case CIT_WRITE:
+ break;
+ case CIT_FAULT:
+ case CIT_FSYNC:
+ LASSERT(!io->ci_need_restart);
+ break;
+ case CIT_SETATTR:
+ case CIT_MISC:
+ /* Check ignore layout change conf */
+ LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
+ !io->ci_need_restart));
+ break;
+ default:
+ LBUG();
+ }
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj)
+{
+ struct cl_object *scan;
+ int result;
+
+ LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+ LINVRNT(cl_io_type_is_valid(iot));
+ LINVRNT(cl_io_invariant(io));
+
+ io->ci_type = iot;
+ INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+ INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+ INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+ INIT_LIST_HEAD(&io->ci_layers);
+
+ result = 0;
+ cl_object_for_each(scan, obj) {
+ if (scan->co_ops->coo_io_init != NULL) {
+ result = scan->co_ops->coo_io_init(env, scan, io);
+ if (result != 0)
+ break;
+ }
+ }
+ if (result == 0)
+ io->ci_state = CIS_INIT;
+ return result;
+}
+
+/**
+ * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+
+ LASSERT(obj != cl_object_top(obj));
+ if (info->clt_current_io == NULL)
+ info->clt_current_io = io;
+ return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom.
+ *
+ * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter
+ * what the latter returned.
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, struct cl_object *obj)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+
+ LASSERT(obj == cl_object_top(obj));
+ LASSERT(info->clt_current_io == NULL);
+
+ info->clt_current_io = io;
+ return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initialize read or write io.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+ enum cl_io_type iot, loff_t pos, size_t count)
+{
+ LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+ LINVRNT(io->ci_obj != NULL);
+
+ LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+ "io range: %u [%llu, %llu) %u %u\n",
+ iot, (__u64)pos, (__u64)pos + count,
+ io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+ io->u.ci_rw.crw_pos = pos;
+ io->u.ci_rw.crw_count = count;
+ return cl_io_init(env, io, iot, io->ci_obj);
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+ return lu_object_fid(&descr->cld_obj->co_lu);
+}
+
+static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
+ const struct cl_lock_descr *d1)
+{
+ return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?:
+ __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+ const struct cl_lock_descr *d1)
+{
+ int ret;
+
+ ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+ if (ret)
+ return ret;
+ if (d0->cld_end < d1->cld_start)
+ return -1;
+ if (d0->cld_start > d1->cld_end)
+ return 1;
+ return 0;
+}
+
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+ const struct cl_lock_descr *d1)
+{
+ d0->cld_start = min(d0->cld_start, d1->cld_start);
+ d0->cld_end = max(d0->cld_end, d1->cld_end);
+
+ if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE)
+ d0->cld_mode = CLM_WRITE;
+
+ if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP)
+ d0->cld_mode = CLM_GROUP;
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+ int done = 0;
+
+ /* hidden treasure: bubble sort for now. */
+ do {
+ struct cl_io_lock_link *curr;
+ struct cl_io_lock_link *prev;
+ struct cl_io_lock_link *temp;
+
+ done = 1;
+ prev = NULL;
+
+ list_for_each_entry_safe(curr, temp,
+ &io->ci_lockset.cls_todo,
+ cill_linkage) {
+ if (prev != NULL) {
+ switch (cl_lock_descr_sort(&prev->cill_descr,
+ &curr->cill_descr)) {
+ case 0:
+ /*
+ * IMPOSSIBLE: Identical locks are
+ * already removed at
+ * this point.
+ */
+ default:
+ LBUG();
+ case +1:
+ list_move_tail(&curr->cill_linkage,
+ &prev->cill_linkage);
+ done = 0;
+ continue; /* don't change prev: it's
+ * still "previous" */
+ case -1: /* already in order */
+ break;
+ }
+ }
+ prev = curr;
+ }
+ } while (!done);
+}
+
+/**
+ * Check whether \a queue contains locks matching \a need.
+ *
+ * \retval +ve there is a matching lock in the \a queue
+ * \retval 0 there are no matching locks in the \a queue
+ */
+int cl_queue_match(const struct list_head *queue,
+ const struct cl_lock_descr *need)
+{
+ struct cl_io_lock_link *scan;
+
+ list_for_each_entry(scan, queue, cill_linkage) {
+ if (cl_lock_descr_match(&scan->cill_descr, need))
+ return +1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cl_queue_match);
+
+static int cl_queue_merge(const struct list_head *queue,
+ const struct cl_lock_descr *need)
+{
+ struct cl_io_lock_link *scan;
+
+ list_for_each_entry(scan, queue, cill_linkage) {
+ if (cl_lock_descr_cmp(&scan->cill_descr, need))
+ continue;
+ cl_lock_descr_merge(&scan->cill_descr, need);
+ CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+ scan->cill_descr.cld_mode, scan->cill_descr.cld_start,
+ scan->cill_descr.cld_end);
+ return +1;
+ }
+ return 0;
+
+}
+
+static int cl_lockset_match(const struct cl_lockset *set,
+ const struct cl_lock_descr *need)
+{
+ return cl_queue_match(&set->cls_curr, need) ||
+ cl_queue_match(&set->cls_done, need);
+}
+
+static int cl_lockset_merge(const struct cl_lockset *set,
+ const struct cl_lock_descr *need)
+{
+ return cl_queue_merge(&set->cls_todo, need) ||
+ cl_lockset_match(set, need);
+}
+
+static int cl_lockset_lock_one(const struct lu_env *env,
+ struct cl_io *io, struct cl_lockset *set,
+ struct cl_io_lock_link *link)
+{
+ struct cl_lock *lock;
+ int result;
+
+ lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+
+ if (!IS_ERR(lock)) {
+ link->cill_lock = lock;
+ list_move(&link->cill_linkage, &set->cls_curr);
+ if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) {
+ result = cl_wait(env, lock);
+ if (result == 0)
+ list_move(&link->cill_linkage,
+ &set->cls_done);
+ } else
+ result = 0;
+ } else
+ result = PTR_ERR(lock);
+ return result;
+}
+
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+ struct cl_io_lock_link *link)
+{
+ struct cl_lock *lock = link->cill_lock;
+
+ list_del_init(&link->cill_linkage);
+ if (lock != NULL) {
+ cl_lock_release(env, lock, "io", io);
+ link->cill_lock = NULL;
+ }
+ if (link->cill_fini != NULL)
+ link->cill_fini(env, link);
+}
+
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+ struct cl_lockset *set)
+{
+ struct cl_io_lock_link *link;
+ struct cl_io_lock_link *temp;
+ struct cl_lock *lock;
+ int result;
+
+ result = 0;
+ list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+ if (!cl_lockset_match(set, &link->cill_descr)) {
+ /* XXX some locking to guarantee that locks aren't
+ * expanded in between. */
+ result = cl_lockset_lock_one(env, io, set, link);
+ if (result != 0)
+ break;
+ } else
+ cl_lock_link_fini(env, io, link);
+ }
+ if (result == 0) {
+ list_for_each_entry_safe(link, temp,
+ &set->cls_curr, cill_linkage) {
+ lock = link->cill_lock;
+ result = cl_wait(env, lock);
+ if (result == 0)
+ list_move(&link->cill_linkage,
+ &set->cls_done);
+ else
+ break;
+ }
+ }
+ return result;
+}
+
+/**
+ * Takes locks necessary for the current iteration of io.
+ *
+ * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required
+ * by layers for the current iteration. Then sorts the locks (to avoid
+ * deadlocks) and acquires them.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+ const struct cl_io_slice *scan;
+ int result = 0;
+
+ LINVRNT(cl_io_is_loopable(io));
+ LINVRNT(io->ci_state == CIS_IT_STARTED);
+ LINVRNT(cl_io_invariant(io));
+
+ cl_io_for_each(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_lock == NULL)
+ continue;
+ result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan);
+ if (result != 0)
+ break;
+ }
+ if (result == 0) {
+ cl_io_locks_sort(io);
+ result = cl_lockset_lock(env, io, &io->ci_lockset);
+ }
+ if (result != 0)
+ cl_io_unlock(env, io);
+ else
+ io->ci_state = CIS_LOCKED;
+ return result;
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Release locks taken by io.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+ struct cl_lockset *set;
+ struct cl_io_lock_link *link;
+ struct cl_io_lock_link *temp;
+ const struct cl_io_slice *scan;
+
+ LASSERT(cl_io_is_loopable(io));
+ LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+ LINVRNT(cl_io_invariant(io));
+
+ set = &io->ci_lockset;
+
+ list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage)
+ cl_lock_link_fini(env, io, link);
+
+ list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage)
+ cl_lock_link_fini(env, io, link);
+
+ list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) {
+ cl_unuse(env, link->cill_lock);
+ cl_lock_link_fini(env, io, link);
+ }
+ cl_io_for_each_reverse(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL)
+ scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+ }
+ io->ci_state = CIS_UNLOCKED;
+ LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired);
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares next iteration of io.
+ *
+ * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give
+ * layers a chance to modify io parameters, e.g., so that lov can restrict io
+ * to a single stripe.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+ const struct cl_io_slice *scan;
+ int result;
+
+ LINVRNT(cl_io_is_loopable(io));
+ LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+ LINVRNT(cl_io_invariant(io));
+
+ result = 0;
+ cl_io_for_each(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL)
+ continue;
+ result = scan->cis_iop->op[io->ci_type].cio_iter_init(env,
+ scan);
+ if (result != 0)
+ break;
+ }
+ if (result == 0)
+ io->ci_state = CIS_IT_STARTED;
+ return result;
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes io iteration.
+ *
+ * Calls cl_io_operations::cio_iter_fini() bottom-to-top.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+ const struct cl_io_slice *scan;
+
+ LINVRNT(cl_io_is_loopable(io));
+ LINVRNT(io->ci_state == CIS_UNLOCKED);
+ LINVRNT(cl_io_invariant(io));
+
+ cl_io_for_each_reverse(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL)
+ scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+ }
+ io->ci_state = CIS_IT_ENDED;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that read or write io progressed \a nob bytes forward.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+ const struct cl_io_slice *scan;
+
+ LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+ nob == 0);
+ LINVRNT(cl_io_is_loopable(io));
+ LINVRNT(cl_io_invariant(io));
+
+ io->u.ci_rw.crw_pos += nob;
+ io->u.ci_rw.crw_count -= nob;
+
+ /* layers have to be notified. */
+ cl_io_for_each_reverse(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_advance != NULL)
+ scan->cis_iop->op[io->ci_type].cio_advance(env, scan,
+ nob);
+ }
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds a lock to a lockset.
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+ struct cl_io_lock_link *link)
+{
+ int result;
+
+ if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr))
+ result = +1;
+ else {
+ list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+ result = 0;
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+static void cl_free_io_lock_link(const struct lu_env *env,
+ struct cl_io_lock_link *link)
+{
+ OBD_FREE_PTR(link);
+}
+
+/**
+ * Allocates new lock link, and uses it to add a lock to a lockset.
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+ struct cl_lock_descr *descr)
+{
+ struct cl_io_lock_link *link;
+ int result;
+
+ OBD_ALLOC_PTR(link);
+ if (link != NULL) {
+ link->cill_descr = *descr;
+ link->cill_fini = cl_free_io_lock_link;
+ result = cl_io_lock_add(env, io, link);
+ if (result) /* lock match */
+ link->cill_fini(env, link);
+ } else
+ result = -ENOMEM;
+
+ return result;
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
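
As an illustrative sketch (hypothetical layer method, not from the Lustre
tree), a layer's ->cio_lock() implementation would typically fill in a
cl_lock_descr and queue it through cl_io_lock_alloc_add(); the whole-file
extent below assumes the CL_PAGE_EOF constant from cl_object.h:

static int example_cio_lock(const struct lu_env *env,
			    const struct cl_io_slice *ios)
{
	struct cl_io *io = ios->cis_io;
	struct cl_lock_descr descr;

	memset(&descr, 0, sizeof(descr));
	descr.cld_obj = io->ci_obj;
	descr.cld_mode = io->ci_type == CIT_WRITE ? CLM_WRITE : CLM_READ;
	descr.cld_start = 0;
	descr.cld_end = CL_PAGE_EOF;	/* lock the whole file */

	return cl_io_lock_alloc_add(env, io, &descr);
}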
+
+/**
+ * Starts io by calling cl_io_operations::cio_start() top-to-bottom.
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+ const struct cl_io_slice *scan;
+ int result = 0;
+
+ LINVRNT(cl_io_is_loopable(io));
+ LINVRNT(io->ci_state == CIS_LOCKED);
+ LINVRNT(cl_io_invariant(io));
+
+ io->ci_state = CIS_IO_GOING;
+ cl_io_for_each(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_start == NULL)
+ continue;
+ result = scan->cis_iop->op[io->ci_type].cio_start(env, scan);
+ if (result != 0)
+ break;
+ }
+ if (result >= 0)
+ result = 0;
+ return result;
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Wait until current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+ const struct cl_io_slice *scan;
+
+ LINVRNT(cl_io_is_loopable(io));
+ LINVRNT(io->ci_state == CIS_IO_GOING);
+ LINVRNT(cl_io_invariant(io));
+
+ cl_io_for_each_reverse(scan, io) {
+ if (scan->cis_iop->op[io->ci_type].cio_end != NULL)
+ scan->cis_iop->op[io->ci_type].cio_end(env, scan);
+ /* TODO: error handling. */
+ }
+ io->ci_state = CIS_IO_FINISHED;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{
+ const struct cl_page_slice *slice;
+
+ slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+ LINVRNT(slice != NULL);
+ return slice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+ int result = 1;
+ loff_t start;
+ loff_t end;
+ pgoff_t idx;
+
+ idx = page->cp_index;
+ switch (io->ci_type) {
+ case CIT_READ:
+ case CIT_WRITE:
+ /*
+ * check that [start, end) and [pos, pos + count) extents
+ * overlap.
+ */
+ if (!cl_io_is_append(io)) {
+ const struct cl_io_rw_common *crw = &(io->u.ci_rw);
+ start = cl_offset(page->cp_obj, idx);
+ end = cl_offset(page->cp_obj, idx + 1);
+ result = crw->crw_pos < end &&
+ start < crw->crw_pos + crw->crw_count;
+ }
+ break;
+ case CIT_FAULT:
+ result = io->u.ci_fault.ft_index == idx;
+ break;
+ default:
+ LBUG();
+ }
+ return result;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page)
+{
+ const struct cl_io_slice *scan;
+ struct cl_2queue *queue;
+ int result = 0;
+
+ LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+ LINVRNT(cl_page_is_owned(page, io));
+ LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+ LINVRNT(cl_page_in_io(page, io));
+ LINVRNT(cl_io_invariant(io));
+
+ queue = &io->ci_queue;
+
+ cl_2queue_init(queue);
+ /*
+ * ->cio_read_page() methods called in the loop below are supposed to
+ * never block waiting for network (the only subtle point is the
+ * creation of new pages for read-ahead that might result in cache
+ * shrinking, but currently only clean pages are shrunk and this
+ * requires no network io).
+ *
+ * Should this ever start blocking, a retry loop would be needed for
+ * "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+ */
+ cl_io_for_each(scan, io) {
+ if (scan->cis_iop->cio_read_page != NULL) {
+ const struct cl_page_slice *slice;
+
+ slice = cl_io_slice_page(scan, page);
+ LINVRNT(slice != NULL);
+ result = scan->cis_iop->cio_read_page(env, scan, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ if (result == 0)
+ result = cl_io_submit_rw(env, io, CRT_READ, queue);
+ /*
+ * Unlock unsent pages in case of error.
+ */
+ cl_page_list_disown(env, io, &queue->c2_qin);
+ cl_2queue_fini(env, queue);
+ return result;
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, unsigned from, unsigned to)
+{
+ const struct cl_io_slice *scan;
+ int result = 0;
+
+ LINVRNT(io->ci_type == CIT_WRITE);
+ LINVRNT(cl_page_is_owned(page, io));
+ LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+ LINVRNT(cl_io_invariant(io));
+ LASSERT(cl_page_in_io(page, io));
+
+ cl_io_for_each_reverse(scan, io) {
+ if (scan->cis_iop->cio_prepare_write != NULL) {
+ const struct cl_page_slice *slice;
+
+ slice = cl_io_slice_page(scan, page);
+ result = scan->cis_iop->cio_prepare_write(env, scan,
+ slice,
+ from, to);
+ if (result != 0)
+ break;
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, unsigned from, unsigned to)
+{
+ const struct cl_io_slice *scan;
+ int result = 0;
+
+ LINVRNT(io->ci_type == CIT_WRITE);
+ LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+ LINVRNT(cl_io_invariant(io));
+ /*
+ * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+ * already called cl_page_cache_add(), moving page into CPS_CACHED
+ * state. Better (and more general) way of dealing with such situation
+ * is needed.
+ */
+ LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+ LASSERT(cl_page_in_io(page, io));
+
+ cl_io_for_each(scan, io) {
+ if (scan->cis_iop->cio_commit_write != NULL) {
+ const struct cl_page_slice *slice;
+
+ slice = cl_io_slice_page(scan, page);
+ result = scan->cis_iop->cio_commit_write(env, scan,
+ slice,
+ from, to);
+ if (result != 0)
+ break;
+ }
+ }
+ LINVRNT(result <= 0);
+ return result;
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages are moved to the
+ * queue->c2_qout queue, while queue->c2_qin contains both the pages that did
+ * not need to be submitted and the pages that failed to submit.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type crt, struct cl_2queue *queue)
+{
+ const struct cl_io_slice *scan;
+ int result = 0;
+
+ LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op));
+
+ cl_io_for_each(scan, io) {
+ if (scan->cis_iop->req_op[crt].cio_submit == NULL)
+ continue;
+ result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt,
+ queue);
+ if (result != 0)
+ break;
+ }
+ /*
+ * If ->cio_submit() failed, no pages were sent.
+ */
+ LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages)));
+ return result;
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submit a sync io and wait until the IO finishes or an error happens.
+ * If \a timeout is zero, wait for the IO unconditionally.
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type iot, struct cl_2queue *queue,
+ long timeout)
+{
+ struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+ struct cl_page *pg;
+ int rc;
+
+ cl_page_list_for_each(pg, &queue->c2_qin) {
+ LASSERT(pg->cp_sync_io == NULL);
+ pg->cp_sync_io = anchor;
+ }
+
+ cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+ rc = cl_io_submit_rw(env, io, iot, queue);
+ if (rc == 0) {
+ /*
+ * If some pages weren't sent for any reason (e.g.,
+ * read found up-to-date pages in the cache, or write found
+ * clean pages), count them as completed to avoid infinite
+ * wait.
+ */
+ cl_page_list_for_each(pg, &queue->c2_qin) {
+ pg->cp_sync_io = NULL;
+ cl_sync_io_note(anchor, +1);
+ }
+
+ /* wait for the IO to be finished. */
+ rc = cl_sync_io_wait(env, io, &queue->c2_qout,
+ anchor, timeout);
+ } else {
+ LASSERT(list_empty(&queue->c2_qout.pl_pages));
+ cl_page_list_for_each(pg, &queue->c2_qin)
+ pg->cp_sync_io = NULL;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Cancel an IO which has been submitted by cl_io_submit_rw.
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue)
+{
+ struct cl_page *page;
+ int result = 0;
+
+ CERROR("Canceling ongoing page transmission\n");
+ cl_page_list_for_each(page, queue) {
+ int rc;
+
+ LINVRNT(cl_page_in_io(page, io));
+ rc = cl_page_cancel(env, page);
+ result = result ?: rc;
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ * - cl_io_iter_init()
+ *
+ * - cl_io_lock()
+ *
+ * - cl_io_start()
+ *
+ * - cl_io_end()
+ *
+ * - cl_io_unlock()
+ *
+ * - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do.
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+ int result = 0;
+
+ LINVRNT(cl_io_is_loopable(io));
+
+ do {
+ size_t nob;
+
+ io->ci_continue = 0;
+ result = cl_io_iter_init(env, io);
+ if (result == 0) {
+ nob = io->ci_nob;
+ result = cl_io_lock(env, io);
+ if (result == 0) {
+ /*
+ * Notify layers that locks has been taken,
+ * and do actual i/o.
+ *
+ * - llite: kms, short read;
+ * - llite: generic_file_read();
+ */
+ result = cl_io_start(env, io);
+ /*
+ * Send any remaining pending
+ * io, etc.
+ *
+ * - llite: ll_rw_stats_tally.
+ */
+ cl_io_end(env, io);
+ cl_io_unlock(env, io);
+ cl_io_rw_advance(env, io, io->ci_nob - nob);
+ }
+ }
+ cl_io_iter_fini(env, io);
+ } while (result == 0 && io->ci_continue);
+ if (result == 0)
+ result = io->ci_result;
+ return result < 0 ? result : 0;
+}
+EXPORT_SYMBOL(cl_io_loop);
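
A hypothetical top-level caller (illustrative sketch, not from the Lustre
tree) drives a write through the interfaces above roughly as follows; "obj"
is assumed to be the top-level cl_object of the file and "env" an already
initialized environment for the current thread:

static int example_write(const struct lu_env *env, struct cl_object *obj,
			 loff_t pos, size_t count)
{
	struct cl_io *io = &cl_env_info(env)->clt_io;
	int rc;

	memset(io, 0, sizeof(*io));
	io->ci_obj = obj;

	rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
	if (rc == 0)
		rc = cl_io_loop(env, io);	/* iter/lock/start/end loop */

	/* cl_io_fini() must be called even if initialization failed */
	cl_io_fini(env, io);
	return rc;
}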
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+ struct cl_object *obj,
+ const struct cl_io_operations *ops)
+{
+ struct list_head *linkage = &slice->cis_linkage;
+
+ LASSERT((linkage->prev == NULL && linkage->next == NULL) ||
+ list_empty(linkage));
+
+ list_add_tail(linkage, &io->ci_layers);
+ slice->cis_io = io;
+ slice->cis_obj = obj;
+ slice->cis_iop = ops;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list.
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+ plist->pl_nr = 0;
+ INIT_LIST_HEAD(&plist->pl_pages);
+ plist->pl_owner = current;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to a page list.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+ /* it would be better to check that page is owned by "current" io, but
+ * it is not passed here. */
+ LASSERT(page->cp_owner != NULL);
+ LINVRNT(plist->pl_owner == current);
+
+ lockdep_off();
+ mutex_lock(&page->cp_mutex);
+ lockdep_on();
+ LASSERT(list_empty(&page->cp_batch));
+ list_add_tail(&page->cp_batch, &plist->pl_pages);
+ ++plist->pl_nr;
+ lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist);
+ cl_page_get(page);
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list.
+ */
+void cl_page_list_del(const struct lu_env *env,
+ struct cl_page_list *plist, struct cl_page *page)
+{
+ LASSERT(plist->pl_nr > 0);
+ LINVRNT(plist->pl_owner == current);
+
+ list_del_init(&page->cp_batch);
+ lockdep_off();
+ mutex_unlock(&page->cp_mutex);
+ lockdep_on();
+ --plist->pl_nr;
+ lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist);
+ cl_page_put(env, page);
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from one page list to another.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+ struct cl_page *page)
+{
+ LASSERT(src->pl_nr > 0);
+ LINVRNT(dst->pl_owner == current);
+ LINVRNT(src->pl_owner == current);
+
+ list_move_tail(&page->cp_batch, &dst->pl_pages);
+ --src->pl_nr;
+ ++dst->pl_nr;
+ lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue",
+ src, dst);
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * Splice one cl_page_list onto another, just as list_splice() does for list heads.
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+ struct cl_page *page;
+ struct cl_page *tmp;
+
+ LINVRNT(list->pl_owner == current);
+ LINVRNT(head->pl_owner == current);
+
+ cl_page_list_for_each_safe(page, tmp, list)
+ cl_page_list_move(head, list, page);
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+void cl_page_disown0(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg);
+
+/**
+ * Disowns pages in a queue.
+ */
+void cl_page_list_disown(const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist)
+{
+ struct cl_page *page;
+ struct cl_page *temp;
+
+ LINVRNT(plist->pl_owner == current);
+
+ cl_page_list_for_each_safe(page, temp, plist) {
+ LASSERT(plist->pl_nr > 0);
+
+ list_del_init(&page->cp_batch);
+ lockdep_off();
+ mutex_unlock(&page->cp_mutex);
+ lockdep_on();
+ --plist->pl_nr;
+ /*
+ * cl_page_disown0 rather than usual cl_page_disown() is used,
+ * because pages are possibly in CPS_FREEING state already due
+ * to the call to cl_page_list_discard().
+ */
+ /*
+ * XXX cl_page_disown0() will fail if page is not locked.
+ */
+ cl_page_disown0(env, io, page);
+ lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue",
+ plist);
+ cl_page_put(env, page);
+ }
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+ struct cl_page *page;
+ struct cl_page *temp;
+
+ LINVRNT(plist->pl_owner == current);
+
+ cl_page_list_for_each_safe(page, temp, plist)
+ cl_page_list_del(env, plist, page);
+ LASSERT(plist->pl_nr == 0);
+}
+EXPORT_SYMBOL(cl_page_list_fini);
+
+/**
+ * Owns all pages in a queue.
+ */
+int cl_page_list_own(const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist)
+{
+ struct cl_page *page;
+ struct cl_page *temp;
+ pgoff_t index = 0;
+ int result;
+
+ LINVRNT(plist->pl_owner == current);
+
+ result = 0;
+ cl_page_list_for_each_safe(page, temp, plist) {
+ LASSERT(index <= page->cp_index);
+ index = page->cp_index;
+ if (cl_page_own(env, io, page) == 0)
+ result = result ?: page->cp_error;
+ else
+ cl_page_list_del(env, plist, page);
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes ownership of all pages in a queue.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+ struct cl_io *io, struct cl_page_list *plist)
+{
+ struct cl_page *page;
+
+ LINVRNT(plist->pl_owner == current);
+
+ cl_page_list_for_each(page, plist)
+ cl_page_assume(env, io, page);
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *plist)
+{
+ struct cl_page *page;
+
+ LINVRNT(plist->pl_owner == current);
+ cl_page_list_for_each(page, plist)
+ cl_page_discard(env, io, page);
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps all pages in a queue from user virtual memory.
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *plist)
+{
+ struct cl_page *page;
+ int result;
+
+ LINVRNT(plist->pl_owner == current);
+ result = 0;
+ cl_page_list_for_each(page, plist) {
+ result = cl_page_unmap(env, io, page);
+ if (result != 0)
+ break;
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initialize dual page queue.
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+ cl_page_list_init(&queue->c2_qin);
+ cl_page_list_init(&queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Add a page to the incoming page list of 2-queue.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+ cl_page_list_add(&queue->c2_qin, page);
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disown pages in both lists of a 2-queue.
+ */
+void cl_2queue_disown(const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue)
+{
+ cl_page_list_disown(env, io, &queue->c2_qin);
+ cl_page_list_disown(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discard (truncate) pages in both lists of a 2-queue.
+ */
+void cl_2queue_discard(const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue)
+{
+ cl_page_list_discard(env, io, &queue->c2_qin);
+ cl_page_list_discard(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assume ownership of the pages in both lists of a 2-queue.
+ */
+void cl_2queue_assume(const struct lu_env *env,
+ struct cl_io *io, struct cl_2queue *queue)
+{
+ cl_page_list_assume(env, io, &queue->c2_qin);
+ cl_page_list_assume(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalize both page lists of a 2-queue.
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+ cl_page_list_fini(env, &queue->c2_qout);
+ cl_page_list_fini(env, &queue->c2_qin);
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initialize a 2-queue to contain \a page in its incoming page list.
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+ cl_2queue_init(queue);
+ cl_2queue_add(queue, page);
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
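
An illustrative sketch (hypothetical helper, not from the Lustre tree) of
submitting a single page synchronously with the 2-queue helpers above; the
page is assumed to be already owned by the io:

static int example_read_one_page(const struct lu_env *env, struct cl_io *io,
				 struct cl_page *page)
{
	struct cl_2queue *queue = &cl_env_info(env)->clt_queue;
	int rc;

	cl_2queue_init_page(queue, page);	/* page lands in c2_qin */
	rc = cl_io_submit_sync(env, io, CRT_READ, queue, 0);

	/* release ownership of whatever remains on either list */
	cl_2queue_disown(env, io, queue);
	cl_2queue_fini(env, queue);
	return rc;
}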
+
+/**
+ * Returns top-level io.
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+ while (io->ci_parent != NULL)
+ io = io->ci_parent;
+ return io;
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Prints a human-readable representation of \a io through \a printer.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Adds request slice to the compound request.
+ *
+ * This is called by cl_device_operations::cdo_req_init() methods to add a
+ * per-layer state to the request. New state is added at the end of
+ * cl_req::crq_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+ struct cl_device *dev,
+ const struct cl_req_operations *ops)
+{
+ list_add_tail(&slice->crs_linkage, &req->crq_layers);
+ slice->crs_dev = dev;
+ slice->crs_ops = ops;
+ slice->crs_req = req;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{
+ unsigned i;
+
+ LASSERT(list_empty(&req->crq_pages));
+ LASSERT(req->crq_nrpages == 0);
+ LINVRNT(list_empty(&req->crq_layers));
+ LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+
+ if (req->crq_o != NULL) {
+ for (i = 0; i < req->crq_nrobjs; ++i) {
+ struct cl_object *obj = req->crq_o[i].ro_obj;
+ if (obj != NULL) {
+ lu_object_ref_del_at(&obj->co_lu,
+ &req->crq_o[i].ro_obj_ref,
+ "cl_req", req);
+ cl_object_put(env, obj);
+ }
+ }
+ OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof(req->crq_o[0]));
+ }
+ OBD_FREE_PTR(req);
+}
+
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+ struct cl_page *page)
+{
+ struct cl_device *dev;
+ struct cl_page_slice *slice;
+ int result;
+
+ result = 0;
+ page = cl_page_top(page);
+ do {
+ list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+ dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev);
+ if (dev->cd_ops->cdo_req_init != NULL) {
+ result = dev->cd_ops->cdo_req_init(env,
+ dev, req);
+ if (result != 0)
+ break;
+ }
+ }
+ page = page->cp_child;
+ } while (page != NULL && result == 0);
+ return result;
+}
+
+/**
+ * Invokes per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top.
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+ struct cl_req_slice *slice;
+
+ /*
+ * for the lack of list_for_each_entry_reverse_safe()...
+ */
+ while (!list_empty(&req->crq_layers)) {
+ slice = list_entry(req->crq_layers.prev,
+ struct cl_req_slice, crs_linkage);
+ list_del_init(&slice->crs_linkage);
+ if (slice->crs_ops->cro_completion != NULL)
+ slice->crs_ops->cro_completion(env, slice, rc);
+ }
+ cl_req_free(env, req);
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+ enum cl_req_type crt, int nr_objects)
+{
+ struct cl_req *req;
+
+ LINVRNT(nr_objects > 0);
+
+ OBD_ALLOC_PTR(req);
+ if (req != NULL) {
+ int result;
+
+ req->crq_type = crt;
+ INIT_LIST_HEAD(&req->crq_pages);
+ INIT_LIST_HEAD(&req->crq_layers);
+
+ OBD_ALLOC(req->crq_o, nr_objects * sizeof(req->crq_o[0]));
+ if (req->crq_o != NULL) {
+ req->crq_nrobjs = nr_objects;
+ result = cl_req_init(env, req, page);
+ } else
+ result = -ENOMEM;
+ if (result != 0) {
+ cl_req_completion(env, req, result);
+ req = ERR_PTR(result);
+ }
+ } else
+ req = ERR_PTR(-ENOMEM);
+ return req;
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ */
+void cl_req_page_add(const struct lu_env *env,
+ struct cl_req *req, struct cl_page *page)
+{
+ struct cl_object *obj;
+ struct cl_req_obj *rqo;
+ int i;
+
+ page = cl_page_top(page);
+
+ LASSERT(list_empty(&page->cp_flight));
+ LASSERT(page->cp_req == NULL);
+
+ CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n",
+ req, req->crq_type, req->crq_nrpages);
+
+ list_add_tail(&page->cp_flight, &req->crq_pages);
+ ++req->crq_nrpages;
+ page->cp_req = req;
+ obj = cl_object_top(page->cp_obj);
+ for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) {
+ if (rqo->ro_obj == NULL) {
+ rqo->ro_obj = obj;
+ cl_object_get(obj);
+ lu_object_ref_add_at(&obj->co_lu, &rqo->ro_obj_ref,
+ "cl_req", req);
+ break;
+ }
+ }
+ LASSERT(i < req->crq_nrobjs);
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+ struct cl_req *req = page->cp_req;
+
+ page = cl_page_top(page);
+
+ LASSERT(!list_empty(&page->cp_flight));
+ LASSERT(req->crq_nrpages > 0);
+
+ list_del_init(&page->cp_flight);
+ --req->crq_nrpages;
+ page->cp_req = NULL;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Notifies layers that request is about to depart by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+ int i;
+ int result;
+ const struct cl_req_slice *slice;
+
+ /*
+ * Check that the caller of cl_req_alloc() didn't lie about the number
+ * of objects.
+ */
+ for (i = 0; i < req->crq_nrobjs; ++i)
+ LASSERT(req->crq_o[i].ro_obj != NULL);
+
+ result = 0;
+ list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+ if (slice->crs_ops->cro_prep != NULL) {
+ result = slice->crs_ops->cro_prep(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_req_prep);
+
+/**
+ * Fills in attributes that are passed to the server together with the transfer.
+ * Only attributes from \a flags may be touched. This can be called multiple times
+ * for the same request.
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+ struct cl_req_attr *attr, u64 flags)
+{
+ const struct cl_req_slice *slice;
+ struct cl_page *page;
+ int i;
+
+ LASSERT(!list_empty(&req->crq_pages));
+
+ /* Take any page to use as a model. */
+ page = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+ for (i = 0; i < req->crq_nrobjs; ++i) {
+ list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+ const struct cl_page_slice *scan;
+ const struct cl_object *obj;
+
+ scan = cl_page_at(page,
+ slice->crs_dev->cd_lu_dev.ld_type);
+ LASSERT(scan != NULL);
+ obj = scan->cpl_obj;
+ if (slice->crs_ops->cro_attr_set != NULL)
+ slice->crs_ops->cro_attr_set(env, slice, obj,
+ attr + i, flags);
+ }
+ }
+}
+EXPORT_SYMBOL(cl_req_attr_set);
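
As an illustrative sketch (hypothetical helper, not from the Lustre tree),
the transfer-request calls above are normally driven in this order for a
page that is at the top of its stack and not yet attached to any request:

static int example_build_req(const struct lu_env *env, struct cl_page *page)
{
	struct cl_req *req;
	int rc;

	req = cl_req_alloc(env, page, CRT_WRITE, 1 /* one object */);
	if (IS_ERR(req))
		return PTR_ERR(req);

	cl_req_page_add(env, req, page);	/* once per page */
	rc = cl_req_prep(env, req);

	/* ... the transport would send the pages here ... */

	cl_req_page_done(env, page);		/* once per page, on completion */
	cl_req_completion(env, req, rc);	/* releases the request */
	return rc;
}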
+
+/* XXX complete(), init_completion(), and wait_for_completion(), until they are
+ * implemented in libcfs. */
+# include <linux/sched.h>
+
+/**
+ * Initialize synchronous io wait anchor, for transfer of \a nrpages pages.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+ init_waitqueue_head(&anchor->csi_waitq);
+ atomic_set(&anchor->csi_sync_nr, nrpages);
+ atomic_set(&anchor->csi_barrier, nrpages > 0);
+ anchor->csi_sync_rc = 0;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Wait until all transfers complete. The transfer completion routine has to call
+ * cl_sync_io_note() for every page.
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct cl_sync_io *anchor,
+ long timeout)
+{
+ struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+ NULL, NULL, NULL);
+ int rc;
+
+ LASSERT(timeout >= 0);
+
+ rc = l_wait_event(anchor->csi_waitq,
+ atomic_read(&anchor->csi_sync_nr) == 0,
+ &lwi);
+ if (rc < 0) {
+ CERROR("SYNC IO failed with error: %d, try to cancel %d remaining pages\n",
+ rc, atomic_read(&anchor->csi_sync_nr));
+
+ (void)cl_io_cancel(env, io, queue);
+
+ lwi = (struct l_wait_info) { 0 };
+ (void)l_wait_event(anchor->csi_waitq,
+ atomic_read(&anchor->csi_sync_nr) == 0,
+ &lwi);
+ } else {
+ rc = anchor->csi_sync_rc;
+ }
+ LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+ cl_page_list_assume(env, io, queue);
+
+ /* wait until cl_sync_io_note() has done wakeup */
+ while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) {
+ cpu_relax();
+ }
+
+ POISON(anchor, 0x5a, sizeof(*anchor));
+ return rc;
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicate that transfer of a single page completed.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+ if (anchor->csi_sync_rc == 0 && ioret < 0)
+ anchor->csi_sync_rc = ioret;
+ /*
+ * Synchronous IO done without releasing page lock (e.g., as a part of
+ * ->{prepare,commit}_write()). Completion is used to signal the end of
+ * IO.
+ */
+ LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+ if (atomic_dec_and_test(&anchor->csi_sync_nr)) {
+ wake_up_all(&anchor->csi_waitq);
+ /* it's safe to nuke or reuse anchor now */
+ atomic_set(&anchor->csi_barrier, 0);
+ }
+}
+EXPORT_SYMBOL(cl_sync_io_note);
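
A hypothetical completion handler (illustrative sketch, not from the Lustre
tree) pairs with cl_sync_io_wait() by noting every finished page on its
anchor, mirroring what cl_io_submit_sync() expects:

static void example_page_completion(struct cl_page *page, int ioret)
{
	struct cl_sync_io *anchor = page->cp_sync_io;

	if (anchor != NULL) {
		page->cp_sync_io = NULL;
		cl_sync_io_note(anchor, ioret);
	}
}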
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
new file mode 100644
index 000000000..b081167f9
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
@@ -0,0 +1,2239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include <linux/list.h>
+#include "../include/cl_object.h"
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem;
+
+static struct lu_kmem_descr cl_lock_caches[] = {
+ {
+ .ckd_cache = &cl_lock_kmem,
+ .ckd_name = "cl_lock_kmem",
+ .ckd_size = sizeof(struct cl_lock)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+#define CS_LOCK_INC(o, item)
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant that is maintained at all times. Caller either has a
+ * reference to \a lock, or somehow assures that \a lock cannot be freed.
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+ const struct cl_lock *lock)
+{
+ return ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
+ atomic_read(&lock->cll_ref) >= lock->cll_holds &&
+ lock->cll_holds >= lock->cll_users &&
+ lock->cll_holds >= 0 &&
+ lock->cll_users >= 0 &&
+ lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant, checking that caller has a reference on a lock.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+ const struct cl_lock *lock)
+{
+ int result;
+
+ result = atomic_read(&lock->cll_ref) > 0 &&
+ cl_lock_invariant_trusted(env, lock);
+ if (!result && env != NULL)
+ CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+ return result;
+}
+
+/**
+ * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+ return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
+}
+
+/**
+ * Returns a set of counters for this lock, depending on a lock nesting.
+ */
+static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
+ const struct cl_lock *lock)
+{
+ struct cl_thread_info *info;
+ enum clt_nesting_level nesting;
+
+ info = cl_env_info(env);
+ nesting = cl_lock_nesting(lock);
+ LASSERT(nesting < ARRAY_SIZE(info->clt_counters));
+ return &info->clt_counters[nesting];
+}
+
+static void cl_lock_trace0(int level, const struct lu_env *env,
+ const char *prefix, const struct cl_lock *lock,
+ const char *func, const int line)
+{
+ struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+
+ CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)(%p/%d/%d) at %s():%d\n",
+ prefix, lock, atomic_read(&lock->cll_ref),
+ lock->cll_guarder, lock->cll_depth,
+ lock->cll_state, lock->cll_error, lock->cll_holds,
+ lock->cll_users, lock->cll_flags,
+ env, h->coh_nesting, cl_lock_nr_mutexed(env),
+ func, line);
+}
+#define cl_lock_trace(level, env, prefix, lock) \
+ cl_lock_trace0(level, env, prefix, lock, __func__, __LINE__)
+
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key cl_lock_key;
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+ lockdep_set_class_and_name(lock, &cl_lock_key, "EXT");
+}
+
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+ struct cl_lock *lock, __u32 enqflags)
+{
+ cl_lock_counters(env, lock)->ctc_nr_locks_acquired++;
+ lock_map_acquire(&lock->dep_map);
+}
+
+static void cl_lock_lockdep_release(const struct lu_env *env,
+ struct cl_lock *lock)
+{
+ cl_lock_counters(env, lock)->ctc_nr_locks_acquired--;
+ lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{}
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+ struct cl_lock *lock, __u32 enqflags)
+{}
+static void cl_lock_lockdep_release(const struct lu_env *env,
+ struct cl_lock *lock)
+{}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Adds lock slice to the compound lock.
+ *
+ * This is called by cl_object_operations::coo_lock_init() methods to add a
+ * per-layer state to the lock. New state is added at the end of
+ * cl_lock::cll_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+ struct cl_object *obj,
+ const struct cl_lock_operations *ops)
+{
+ slice->cls_lock = lock;
+ list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+ slice->cls_obj = obj;
+ slice->cls_ops = ops;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Returns true iff a lock with the mode \a has provides at least the same
+ * guarantees as a lock with the mode \a need.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+ LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+ need == CLM_PHANTOM || need == CLM_GROUP);
+ LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+ has == CLM_PHANTOM || has == CLM_GROUP);
+ CLASSERT(CLM_PHANTOM < CLM_READ);
+ CLASSERT(CLM_READ < CLM_WRITE);
+ CLASSERT(CLM_WRITE < CLM_GROUP);
+
+ if (has != CLM_GROUP)
+ return need <= has;
+ else
+ return need == has;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
+
+/**
+ * Returns true iff extent portions of lock descriptions match.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+ const struct cl_lock_descr *need)
+{
+ return
+ has->cld_start <= need->cld_start &&
+ has->cld_end >= need->cld_end &&
+ cl_lock_mode_match(has->cld_mode, need->cld_mode) &&
+ (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid);
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Returns true iff a lock with the description \a has provides at least the
+ * same guarantees as a lock with the description \a need.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+ const struct cl_lock_descr *need)
+{
+ return
+ cl_object_same(has->cld_obj, need->cld_obj) &&
+ cl_lock_ext_match(has, need);
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
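+
+/*
+ * Worked example (editor's addition, not from the original sources): under
+ * the matching rules above, a cached descriptor covering [0, 1023] in
+ * CLM_WRITE mode satisfies a need for [256, 511] in CLM_READ mode, since the
+ * extent is contained and CLM_READ <= CLM_WRITE; a CLM_GROUP descriptor only
+ * matches a CLM_GROUP need with the same cld_gid. The descriptor variables
+ * below are hypothetical.
+ *
+ *   cl_lock_ext_match(&write_0_1023, &read_256_511);  // returns 1
+ *   cl_lock_ext_match(&group_gid1, &group_gid2);      // returns 0
+ */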
+
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_object *obj = lock->cll_descr.cld_obj;
+
+ LINVRNT(!cl_lock_is_mutexed(lock));
+
+ cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
+ might_sleep();
+ while (!list_empty(&lock->cll_layers)) {
+ struct cl_lock_slice *slice;
+
+ slice = list_entry(lock->cll_layers.next,
+ struct cl_lock_slice, cls_linkage);
+ list_del_init(lock->cll_layers.next);
+ slice->cls_ops->clo_fini(env, slice);
+ }
+ CS_LOCK_DEC(obj, total);
+ CS_LOCKSTATE_DEC(obj, lock->cll_state);
+ lu_object_ref_del_at(&obj->co_lu, &lock->cll_obj_ref, "cl_lock", lock);
+ cl_object_put(env, obj);
+ lu_ref_fini(&lock->cll_reference);
+ lu_ref_fini(&lock->cll_holders);
+ mutex_destroy(&lock->cll_guard);
+ OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When last reference is released, lock is returned to the cache, unless it
+ * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
+ * immediately.
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_object *obj;
+
+ LINVRNT(cl_lock_invariant(env, lock));
+ obj = lock->cll_descr.cld_obj;
+ LINVRNT(obj != NULL);
+
+ CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
+ atomic_read(&lock->cll_ref), lock, RETIP);
+
+ if (atomic_dec_and_test(&lock->cll_ref)) {
+ if (lock->cll_state == CLS_FREEING) {
+ LASSERT(list_empty(&lock->cll_linkage));
+ cl_lock_free(env, lock);
+ }
+ CS_LOCK_DEC(obj, busy);
+ }
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * lock.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_invariant(NULL, lock));
+ CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
+ atomic_read(&lock->cll_ref), lock, RETIP);
+ atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock.
+ *
+ * This is much like cl_lock_get(), except that this function can be used to
+ * acquire initial reference to the cached lock. Caller has to deal with all
+ * possible races. Use with care!
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+ CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+ atomic_read(&lock->cll_ref), lock, RETIP);
+ if (atomic_inc_return(&lock->cll_ref) == 1)
+ CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
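+
+/*
+ * Illustrative sketch (editor's addition, not from the original sources):
+ * every cl_lock_get() or cl_lock_get_trust() must eventually be balanced by
+ * a cl_lock_put(); "lock" is assumed to be already referenced by the caller.
+ *
+ *   cl_lock_get(lock);
+ *   // ... use the lock ...
+ *   cl_lock_put(env, lock);
+ */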
+
+/**
+ * Helper function destroying the lock that wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+ cl_lock_mutex_get(env, lock);
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+}
+
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+ struct cl_object *obj,
+ const struct cl_io *io,
+ const struct cl_lock_descr *descr)
+{
+ struct cl_lock *lock;
+ struct lu_object_header *head;
+
+ OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, GFP_NOFS);
+ if (lock != NULL) {
+ atomic_set(&lock->cll_ref, 1);
+ lock->cll_descr = *descr;
+ lock->cll_state = CLS_NEW;
+ cl_object_get(obj);
+ lu_object_ref_add_at(&obj->co_lu, &lock->cll_obj_ref, "cl_lock",
+ lock);
+ INIT_LIST_HEAD(&lock->cll_layers);
+ INIT_LIST_HEAD(&lock->cll_linkage);
+ INIT_LIST_HEAD(&lock->cll_inclosure);
+ lu_ref_init(&lock->cll_reference);
+ lu_ref_init(&lock->cll_holders);
+ mutex_init(&lock->cll_guard);
+ lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+ init_waitqueue_head(&lock->cll_wq);
+ head = obj->co_lu.lo_header;
+ CS_LOCKSTATE_INC(obj, CLS_NEW);
+ CS_LOCK_INC(obj, total);
+ CS_LOCK_INC(obj, create);
+ cl_lock_lockdep_init(lock);
+ list_for_each_entry(obj, &head->loh_layers,
+ co_lu.lo_linkage) {
+ int err;
+
+ err = obj->co_ops->coo_lock_init(env, obj, lock, io);
+ if (err != 0) {
+ cl_lock_finish(env, lock);
+ lock = ERR_PTR(err);
+ break;
+ }
+ }
+ } else {
+ lock = ERR_PTR(-ENOMEM);
+ }
+ return lock;
+}
+
+/**
+ * Transfer the lock into INTRANSIT state and return the original state.
+ *
+ * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \see CLS_INTRANSIT
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+ struct cl_lock *lock)
+{
+ enum cl_lock_state state = lock->cll_state;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(state != CLS_INTRANSIT);
+ LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+ "Malformed lock state %d.\n", state);
+
+ cl_lock_state_set(env, lock, CLS_INTRANSIT);
+ lock->cll_intransit_owner = current;
+ cl_lock_hold_add(env, lock, "intransit", current);
+ return state;
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Exits the intransit state and restores the lock state to its original state.
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state)
+{
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+ LASSERT(state != CLS_INTRANSIT);
+ LASSERT(lock->cll_intransit_owner == current);
+
+ lock->cll_intransit_owner = NULL;
+ cl_lock_state_set(env, lock, state);
+ cl_lock_unhold(env, lock, "intransit", current);
+}
+EXPORT_SYMBOL(cl_lock_extransit);
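+
+/*
+ * Illustrative sketch (editor's addition, not from the original sources):
+ * an operation that must temporarily own the lock state, as cl_use_try()
+ * and cl_unuse_try() below do, takes the lock through CLS_INTRANSIT and
+ * later restores (or updates) the saved state:
+ *
+ *   enum cl_lock_state state;
+ *
+ *   state = cl_lock_intransit(env, lock);   // lock mutex must be held
+ *   // ... perform the transition work ...
+ *   cl_lock_extransit(env, lock, state);    // restore the saved state
+ */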
+
+/**
+ * Checks whether the lock is in the intransit state.
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+ LASSERT(cl_lock_is_mutexed(lock));
+ return lock->cll_state == CLS_INTRANSIT &&
+ lock->cll_intransit_owner != current;
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+
+/**
+ * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
+ * truncate and O_APPEND cannot be reused for read/non-append-write, as they
+ * cover multiple stripes and can trigger cascading timeouts.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock *lock,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io)
+{
+ const struct cl_lock_slice *slice;
+
+ LINVRNT(cl_lock_invariant_trusted(env, lock));
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_fits_into != NULL &&
+ !slice->cls_ops->clo_fits_into(env, slice, need, io))
+ return 0;
+ }
+ return 1;
+}
+
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+ struct cl_object *obj,
+ const struct cl_io *io,
+ const struct cl_lock_descr *need)
+{
+ struct cl_lock *lock;
+ struct cl_object_header *head;
+
+ head = cl_object_header(obj);
+ assert_spin_locked(&head->coh_lock_guard);
+ CS_LOCK_INC(obj, lookup);
+ list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
+ int matched;
+
+ matched = cl_lock_ext_match(&lock->cll_descr, need) &&
+ lock->cll_state < CLS_FREEING &&
+ lock->cll_error == 0 &&
+ !(lock->cll_flags & CLF_CANCELLED) &&
+ cl_lock_fits_into(env, lock, need, io);
+ CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
+ PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
+ matched);
+ if (matched) {
+ cl_lock_get_trust(lock);
+ CS_LOCK_INC(obj, hit);
+ return lock;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. First, a
+ * cache (implemented as a per-object linked list) is consulted. If lock is
+ * found there, it is returned immediately. Otherwise new lock is allocated
+ * and returned. In any case, additional reference to lock is acquired.
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+ const struct cl_io *io,
+ const struct cl_lock_descr *need)
+{
+ struct cl_object_header *head;
+ struct cl_object *obj;
+ struct cl_lock *lock;
+
+ obj = need->cld_obj;
+ head = cl_object_header(obj);
+
+ spin_lock(&head->coh_lock_guard);
+ lock = cl_lock_lookup(env, obj, io, need);
+ spin_unlock(&head->coh_lock_guard);
+
+ if (lock == NULL) {
+ lock = cl_lock_alloc(env, obj, io, need);
+ if (!IS_ERR(lock)) {
+ struct cl_lock *ghost;
+
+ spin_lock(&head->coh_lock_guard);
+ ghost = cl_lock_lookup(env, obj, io, need);
+ if (ghost == NULL) {
+ cl_lock_get_trust(lock);
+ list_add_tail(&lock->cll_linkage,
+ &head->coh_locks);
+ spin_unlock(&head->coh_lock_guard);
+ CS_LOCK_INC(obj, busy);
+ } else {
+ spin_unlock(&head->coh_lock_guard);
+ /*
+ * Other threads can acquire references to the
+ * top-lock through its sub-locks. Hence, it
+ * cannot be cl_lock_free()-ed immediately.
+ */
+ cl_lock_finish(env, lock);
+ lock = ghost;
+ }
+ }
+ }
+ return lock;
+}
+
+/**
+ * Returns existing lock matching given description. This is similar to
+ * cl_lock_find() except that no new lock is created, and returned lock is
+ * guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_object_header *head;
+ struct cl_object *obj;
+ struct cl_lock *lock;
+
+ obj = need->cld_obj;
+ head = cl_object_header(obj);
+
+ do {
+ spin_lock(&head->coh_lock_guard);
+ lock = cl_lock_lookup(env, obj, io, need);
+ spin_unlock(&head->coh_lock_guard);
+ if (lock == NULL)
+ return NULL;
+
+ cl_lock_mutex_get(env, lock);
+ if (lock->cll_state == CLS_INTRANSIT)
+ /* Don't care about the return value. */
+ cl_lock_state_wait(env, lock);
+ if (lock->cll_state == CLS_FREEING) {
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+ lock = NULL;
+ }
+ } while (lock == NULL);
+
+ cl_lock_hold_add(env, lock, scope, source);
+ cl_lock_user_add(env, lock);
+ if (lock->cll_state == CLS_CACHED)
+ cl_use_try(env, lock, 1);
+ if (lock->cll_state == CLS_HELD) {
+ cl_lock_mutex_put(env, lock);
+ cl_lock_lockdep_acquire(env, lock, 0);
+ cl_lock_put(env, lock);
+ } else {
+ cl_unuse_try(env, lock);
+ cl_lock_unhold(env, lock, scope, source);
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+ lock = NULL;
+ }
+
+ return lock;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Returns a slice within a lock, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+ const struct lu_device_type *dtype)
+{
+ const struct cl_lock_slice *slice;
+
+ LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype)
+ return slice;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_thread_counters *counters;
+
+ counters = cl_lock_counters(env, lock);
+ lock->cll_depth++;
+ counters->ctc_nr_locks_locked++;
+ lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock);
+ cl_lock_trace(D_TRACE, env, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ if (lock->cll_guarder == current) {
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(lock->cll_depth > 0);
+ } else {
+ struct cl_object_header *hdr;
+ struct cl_thread_info *info;
+ int i;
+
+ LINVRNT(lock->cll_guarder != current);
+ hdr = cl_object_header(lock->cll_descr.cld_obj);
+ /*
+ * Check that mutexes are taken in the bottom-to-top order.
+ */
+ info = cl_env_info(env);
+ for (i = 0; i < hdr->coh_nesting; ++i)
+ LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+ mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+ lock->cll_guarder = current;
+ LINVRNT(lock->cll_depth == 0);
+ }
+ cl_lock_mutex_tail(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Try-locks cl_lock object.
+ *
+ * \retval 0 \a lock was successfully locked
+ *
+ * \retval -EBUSY \a lock cannot be locked right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+
+ LINVRNT(cl_lock_invariant_trusted(env, lock));
+
+ result = 0;
+ if (lock->cll_guarder == current) {
+ LINVRNT(lock->cll_depth > 0);
+ cl_lock_mutex_tail(env, lock);
+ } else if (mutex_trylock(&lock->cll_guard)) {
+ LINVRNT(lock->cll_depth == 0);
+ lock->cll_guarder = current;
+ cl_lock_mutex_tail(env, lock);
+ } else {
+ result = -EBUSY;
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_thread_counters *counters;
+
+ LINVRNT(cl_lock_invariant(env, lock));
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(lock->cll_guarder == current);
+ LINVRNT(lock->cll_depth > 0);
+
+ counters = cl_lock_counters(env, lock);
+ LINVRNT(counters->ctc_nr_locks_locked > 0);
+
+ cl_lock_trace(D_TRACE, env, "put mutex", lock);
+ lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock);
+ counters->ctc_nr_locks_locked--;
+ if (--lock->cll_depth == 0) {
+ lock->cll_guarder = NULL;
+ mutex_unlock(&lock->cll_guard);
+ }
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
+
+/**
+ * Returns true iff lock's mutex is owned by the current thread.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+ return lock->cll_guarder == current;
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
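+
+/*
+ * Illustrative sketch (editor's addition, not from the original sources):
+ * cl_lock fields and state transitions are manipulated only under the lock
+ * mutex, taken and released with the helpers above:
+ *
+ *   cl_lock_mutex_get(env, lock);
+ *   LASSERT(cl_lock_is_mutexed(lock));
+ *   // ... inspect or change lock->cll_* fields ...
+ *   cl_lock_mutex_put(env, lock);
+ */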
+
+/**
+ * Returns number of cl_lock mutexes held by the current thread (environment).
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+ struct cl_thread_info *info;
+ int i;
+ int locked;
+
+ /*
+ * NOTE: if summation across all nesting levels (currently 2) proves
+ * too expensive, a summary counter can be added to
+ * struct cl_thread_info.
+ */
+ info = cl_env_info(env);
+ for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+ locked += info->clt_counters[i].ctc_nr_locks_locked;
+ return locked;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ if (!(lock->cll_flags & CLF_CANCELLED)) {
+ const struct cl_lock_slice *slice;
+
+ lock->cll_flags |= CLF_CANCELLED;
+ list_for_each_entry_reverse(slice, &lock->cll_layers,
+ cls_linkage) {
+ if (slice->cls_ops->clo_cancel != NULL)
+ slice->cls_ops->clo_cancel(env, slice);
+ }
+ }
+}
+
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_object_header *head;
+ const struct cl_lock_slice *slice;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ if (lock->cll_state < CLS_FREEING) {
+ bool in_cache;
+
+ LASSERT(lock->cll_state != CLS_INTRANSIT);
+ cl_lock_state_set(env, lock, CLS_FREEING);
+
+ head = cl_object_header(lock->cll_descr.cld_obj);
+
+ spin_lock(&head->coh_lock_guard);
+ in_cache = !list_empty(&lock->cll_linkage);
+ if (in_cache)
+ list_del_init(&lock->cll_linkage);
+ spin_unlock(&head->coh_lock_guard);
+
+ if (in_cache) /* coh_locks cache holds a refcount. */
+ cl_lock_put(env, lock);
+
+ /*
+ * From now on, no new references to this lock can be acquired
+ * by cl_lock_lookup().
+ */
+ list_for_each_entry_reverse(slice, &lock->cll_layers,
+ cls_linkage) {
+ if (slice->cls_ops->clo_delete != NULL)
+ slice->cls_ops->clo_delete(env, slice);
+ }
+ /*
+ * From now on, no new references to this lock can be acquired
+ * by layer-specific means (like a pointer from struct
+ * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+ * lov).
+ *
+ * Lock will be finally freed in cl_lock_put() when last of
+ * existing references goes away.
+ */
+ }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a
+ * top-lock (nesting == 0) accounts for this modification in the per-thread
+ * debugging counters. Sub-lock holds can be released by a thread different
+ * from one that acquired it.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_holds += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_held += delta;
+ LASSERT(counters->ctc_nr_held >= 0);
+ }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock. See
+ * cl_lock_hold_mod() for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_users += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_used += delta;
+ LASSERT(counters->ctc_nr_used >= 0);
+ }
+}
+
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+ lu_ref_del(&lock->cll_holders, scope, source);
+ cl_lock_hold_mod(env, lock, -1);
+ if (lock->cll_holds == 0) {
+ CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+ if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+ lock->cll_descr.cld_mode == CLM_GROUP ||
+ lock->cll_state != CLS_CACHED)
+ /*
+ * If the lock is still a phantom or group lock when the
+ * user is done with it, destroy the lock.
+ */
+ lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+ if (lock->cll_flags & CLF_CANCELPEND) {
+ lock->cll_flags &= ~CLF_CANCELPEND;
+ cl_lock_cancel0(env, lock);
+ }
+ if (lock->cll_flags & CLF_DOOMED) {
+ /* no longer doomed: it's dead... Jim. */
+ lock->cll_flags &= ~CLF_DOOMED;
+ cl_lock_delete0(env, lock);
+ }
+ }
+}
+EXPORT_SYMBOL(cl_lock_hold_release);
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
+ *
+ * \retval -EINTR wait was interrupted
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+ wait_queue_t waiter;
+ sigset_t blocked;
+ int result;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_depth == 1);
+ LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+ cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
+ result = lock->cll_error;
+ if (result == 0) {
+ /* To avoid being interrupted by the 'non-fatal' signals
+ * (SIGCHLD, for instance), we block them temporarily.
+ * LU-305 */
+ blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+ init_waitqueue_entry(&waiter, current);
+ add_wait_queue(&lock->cll_wq, &waiter);
+ set_current_state(TASK_INTERRUPTIBLE);
+ cl_lock_mutex_put(env, lock);
+
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+ /* Returning ERESTARTSYS instead of EINTR so syscalls
+ * can be restarted if signals are pending here */
+ result = -ERESTARTSYS;
+ if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) {
+ schedule();
+ if (!cfs_signal_pending())
+ result = 0;
+ }
+
+ cl_lock_mutex_get(env, lock);
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&lock->cll_wq, &waiter);
+
+ /* Restore old blocked signals */
+ cfs_restore_sigs(blocked);
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
+
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state)
+{
+ const struct cl_lock_slice *slice;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage)
+ if (slice->cls_ops->clo_state != NULL)
+ slice->cls_ops->clo_state(env, slice, state);
+ wake_up_all(&lock->cll_wq);
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+ cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
+ cl_lock_state_signal(env, lock, lock->cll_state);
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that lock state changed, possible
+ * as a result of an asynchronous event such as call-back reception.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state)
+{
+ LASSERT(lock->cll_state <= state ||
+ (lock->cll_state == CLS_CACHED &&
+ (state == CLS_HELD || /* lock found in cache */
+ state == CLS_NEW || /* sub-lock canceled */
+ state == CLS_INTRANSIT)) ||
+ /* lock is in transit state */
+ lock->cll_state == CLS_INTRANSIT);
+
+ if (lock->cll_state != state) {
+ CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state);
+ CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state);
+
+ cl_lock_state_signal(env, lock, state);
+ lock->cll_state = state;
+ }
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+
+ do {
+ result = 0;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+
+ result = -ENOSYS;
+ list_for_each_entry_reverse(slice, &lock->cll_layers,
+ cls_linkage) {
+ if (slice->cls_ops->clo_unuse != NULL) {
+ result = slice->cls_ops->clo_unuse(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+ } while (result == CLO_REPEAT);
+
+ return result;
+}
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ * If @atomic is 1, the lock must be unused on failure so that it is
+ * recovered to its previous state, keeping the use process atomic.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+ enum cl_lock_state state;
+
+ cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+ LASSERT(lock->cll_state == CLS_CACHED);
+ if (lock->cll_error)
+ return lock->cll_error;
+
+ result = -ENOSYS;
+ state = cl_lock_intransit(env, lock);
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_use != NULL) {
+ result = slice->cls_ops->clo_use(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+
+ LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+ lock->cll_state);
+
+ if (result == 0) {
+ state = CLS_HELD;
+ } else {
+ if (result == -ESTALE) {
+ /*
+ * ESTALE means a sublock is being cancelled
+ * at this time; set the lock state to NEW
+ * here and ask the caller to repeat.
+ */
+ state = CLS_NEW;
+ result = CLO_REPEAT;
+ }
+
+ /* @atomic means back-off-on-failure. */
+ if (atomic) {
+ int rc;
+
+ rc = cl_unuse_try_internal(env, lock);
+ /* Vet the results. */
+ if (rc < 0 && result > 0)
+ result = rc;
+ }
+ }
+ cl_lock_extransit(env, lock, state);
+ return result;
+}
+EXPORT_SYMBOL(cl_use_try);
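+
+/*
+ * Illustrative sketch (editor's addition, not from the original sources):
+ * cl_lock_peek() above is one caller of cl_use_try(); a cached lock is
+ * "used" under its mutex and, with atomic == 1, either reaches CLS_HELD or
+ * is rolled back in one step:
+ *
+ *   cl_lock_mutex_get(env, lock);
+ *   if (lock->cll_state == CLS_CACHED)
+ *       result = cl_use_try(env, lock, 1);
+ *   cl_lock_mutex_put(env, lock);
+ */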
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+ struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+ const struct cl_lock_slice *slice;
+
+ result = -ENOSYS;
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_enqueue != NULL) {
+ result = slice->cls_ops->clo_enqueue(env,
+ slice, io, flags);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+ return result;
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+
+ cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+ do {
+ LINVRNT(cl_lock_is_mutexed(lock));
+
+ result = lock->cll_error;
+ if (result != 0)
+ break;
+
+ switch (lock->cll_state) {
+ case CLS_NEW:
+ cl_lock_state_set(env, lock, CLS_QUEUING);
+ /* fall-through */
+ case CLS_QUEUING:
+ /* kick layers. */
+ result = cl_enqueue_kick(env, lock, io, flags);
+ /* For AGL case, the cl_lock::cll_state may
+ * become CLS_HELD already. */
+ if (result == 0 && lock->cll_state == CLS_QUEUING)
+ cl_lock_state_set(env, lock, CLS_ENQUEUED);
+ break;
+ case CLS_INTRANSIT:
+ LASSERT(cl_lock_is_intransit(lock));
+ result = CLO_WAIT;
+ break;
+ case CLS_CACHED:
+ /* yank lock from the cache. */
+ result = cl_use_try(env, lock, 0);
+ break;
+ case CLS_ENQUEUED:
+ case CLS_HELD:
+ result = 0;
+ break;
+ default:
+ case CLS_FREEING:
+ /*
+ * impossible, only held locks with increased
+ * ->cll_holds can be enqueued, and they cannot be
+ * freed.
+ */
+ LBUG();
+ }
+ } while (result == CLO_REPEAT);
+ return result;
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+ struct cl_lock *lock,
+ int keep_mutex)
+{
+ struct cl_lock *conflict;
+ int rc = 0;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(lock->cll_state == CLS_QUEUING);
+ LASSERT(lock->cll_conflict != NULL);
+
+ conflict = lock->cll_conflict;
+ lock->cll_conflict = NULL;
+
+ cl_lock_mutex_put(env, lock);
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+ cl_lock_mutex_get(env, conflict);
+ cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+ cl_lock_cancel(env, conflict);
+ cl_lock_delete(env, conflict);
+
+ while (conflict->cll_state != CLS_FREEING) {
+ rc = cl_lock_state_wait(env, conflict);
+ if (rc != 0)
+ break;
+ }
+ cl_lock_mutex_put(env, conflict);
+ lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+ cl_lock_put(env, conflict);
+
+ if (keep_mutex)
+ cl_lock_mutex_get(env, lock);
+
+ LASSERT(rc <= 0);
+ return rc;
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 enqflags)
+{
+ int result;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ cl_lock_user_add(env, lock);
+ do {
+ result = cl_enqueue_try(env, lock, io, enqflags);
+ if (result == CLO_WAIT) {
+ if (lock->cll_conflict != NULL)
+ result = cl_lock_enqueue_wait(env, lock, 1);
+ else
+ result = cl_lock_state_wait(env, lock);
+ if (result == 0)
+ continue;
+ }
+ break;
+ } while (1);
+ if (result != 0)
+ cl_unuse_try(env, lock);
+ LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+ lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD));
+ return result;
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on lock.
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 enqflags)
+{
+ int result;
+
+ cl_lock_lockdep_acquire(env, lock, enqflags);
+ cl_lock_mutex_get(env, lock);
+ result = cl_enqueue_locked(env, lock, io, enqflags);
+ cl_lock_mutex_put(env, lock);
+ if (result != 0)
+ cl_lock_lockdep_release(env, lock);
+ LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD));
+ return result;
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called to release underlying resource:
+ * 1. for top lock, the resource is sublocks it held;
+ * 2. for sublock, the resource is the reference to dlmlock.
+ *
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+ enum cl_lock_state state = CLS_NEW;
+
+ cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
+
+ if (lock->cll_users > 1) {
+ cl_lock_user_del(env, lock);
+ return 0;
+ }
+
+ /* Only a lock in the CLS_HELD or CLS_ENQUEUED state can hold
+ * underlying resources. */
+ if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) {
+ cl_lock_user_del(env, lock);
+ return 0;
+ }
+
+ /*
+ * New lock users (->cll_users) are not protecting unlocking
+ * from proceeding. From this point, lock eventually reaches
+ * CLS_CACHED, is reinitialized to CLS_NEW or fails into
+ * CLS_FREEING.
+ */
+ state = cl_lock_intransit(env, lock);
+
+ result = cl_unuse_try_internal(env, lock);
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+ LASSERT(result != CLO_WAIT);
+ cl_lock_user_del(env, lock);
+ if (result == 0 || result == -ESTALE) {
+ /*
+ * Return lock back to the cache. This is the only
+ * place where lock is moved into CLS_CACHED state.
+ *
+ * If one of the ->clo_unuse() methods returned -ESTALE, the lock
+ * cannot be placed into the cache and has to be
+ * re-initialized. This happens, e.g., when a sub-lock was
+ * canceled while unlocking was in progress.
+ */
+ if (state == CLS_HELD && result == 0)
+ state = CLS_CACHED;
+ else
+ state = CLS_NEW;
+ cl_lock_extransit(env, lock, state);
+
+ /*
+ * Hide the -ESTALE error.
+ * Consider a glimpse lock with multiple stripes, where one
+ * of its sublocks returned -ENAVAIL and the other sublocks
+ * matched write locks. In this case we cannot set this lock
+ * to error, because otherwise some of its sublocks may not
+ * be canceled, and some dirty pages would never be written
+ * to the OSTs. -jay
+ */
+ result = 0;
+ } else {
+ CERROR("result = %d, this is unlikely!\n", result);
+ state = CLS_NEW;
+ cl_lock_extransit(env, lock, state);
+ }
+ return result ?: lock->cll_error;
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+
+ result = cl_unuse_try(env, lock);
+ if (result)
+ CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result);
+}
+
+/**
+ * Unlocks a lock.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+ cl_lock_mutex_get(env, lock);
+ cl_unuse_locked(env, lock);
+ cl_lock_mutex_put(env, lock);
+ cl_lock_lockdep_release(env, lock);
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either lock is
+ * granted, or error occurs. This function does not block waiting for network
+ * communication to complete.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+
+ cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
+ do {
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERTF(lock->cll_state == CLS_QUEUING ||
+ lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD ||
+ lock->cll_state == CLS_INTRANSIT,
+ "lock state: %d\n", lock->cll_state);
+ LASSERT(lock->cll_users > 0);
+ LASSERT(lock->cll_holds > 0);
+
+ result = lock->cll_error;
+ if (result != 0)
+ break;
+
+ if (cl_lock_is_intransit(lock)) {
+ result = CLO_WAIT;
+ break;
+ }
+
+ if (lock->cll_state == CLS_HELD)
+ /* nothing to do */
+ break;
+
+ result = -ENOSYS;
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_wait != NULL) {
+ result = slice->cls_ops->clo_wait(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+ if (result == 0) {
+ LASSERT(lock->cll_state != CLS_INTRANSIT);
+ cl_lock_state_set(env, lock, CLS_HELD);
+ }
+ } while (result == CLO_REPEAT);
+ return result;
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+
+ cl_lock_mutex_get(env, lock);
+
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+ "Wrong state %d \n", lock->cll_state);
+ LASSERT(lock->cll_holds > 0);
+
+ do {
+ result = cl_wait_try(env, lock);
+ if (result == CLO_WAIT) {
+ result = cl_lock_state_wait(env, lock);
+ if (result == 0)
+ continue;
+ }
+ break;
+ } while (1);
+ if (result < 0) {
+ cl_unuse_try(env, lock);
+ cl_lock_lockdep_release(env, lock);
+ }
+ cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
+ cl_lock_mutex_put(env, lock);
+ LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+ return result;
+}
+EXPORT_SYMBOL(cl_wait);
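+
+/*
+ * Illustrative sketch (editor's addition, not from the original sources):
+ * a synchronous acquisition path built from cl_enqueue()/cl_wait() above and
+ * cl_lock_hold()/cl_lock_release() defined later in this file; "need",
+ * "scope" and "source" are assumed caller-provided values, and error
+ * handling is elided.
+ *
+ *   struct cl_lock *lock;
+ *
+ *   lock = cl_lock_hold(env, io, need, scope, source);
+ *   if (!IS_ERR(lock)) {
+ *       cl_enqueue(env, lock, io, need->cld_enq_flags);
+ *       cl_wait(env, lock);   // lock is CLS_HELD on success
+ *       // ... do IO under the lock ...
+ *       cl_unuse(env, lock);
+ *       cl_lock_release(env, lock, scope, source);
+ *   }
+ */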
+
+/**
+ * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock
+ * value.
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ unsigned long pound;
+ unsigned long ounce;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ pound = 0;
+ list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_weigh != NULL) {
+ ounce = slice->cls_ops->clo_weigh(env, slice);
+ pound += ounce;
+ if (pound < ounce) /* over-weight^Wflow */
+ pound = ~0UL;
+ }
+ }
+ return pound;
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant the client a lock different from the one that was
+ * requested (e.g., larger in extent). This method is called once the actually
+ * granted lock description becomes known, so that layers can accommodate the
+ * changed lock description.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+ const struct cl_lock_descr *desc)
+{
+ const struct cl_lock_slice *slice;
+ struct cl_object *obj = lock->cll_descr.cld_obj;
+ struct cl_object_header *hdr = cl_object_header(obj);
+ int result;
+
+ cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+ /* don't allow object to change */
+ LASSERT(obj == desc->cld_obj);
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_modify != NULL) {
+ result = slice->cls_ops->clo_modify(env, slice, desc);
+ if (result != 0)
+ return result;
+ }
+ }
+ CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+ PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+ /*
+ * Just replace description in place. Nothing more is needed for
+ * now. If locks were indexed according to their extent and/or mode,
+ * that index would have to be updated here.
+ */
+ spin_lock(&hdr->coh_lock_guard);
+ lock->cll_descr = *desc;
+ spin_unlock(&hdr->coh_lock_guard);
+ return 0;
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+ struct cl_lock_closure *closure,
+ struct cl_lock *origin, int wait)
+{
+ LINVRNT(cl_lock_is_mutexed(origin));
+ LINVRNT(cl_lock_invariant(env, origin));
+
+ INIT_LIST_HEAD(&closure->clc_list);
+ closure->clc_origin = origin;
+ closure->clc_wait = wait;
+ closure->clc_nr = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+
+ LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+ LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+ result = cl_lock_enclosure(env, lock, closure);
+ if (result == 0) {
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_closure != NULL) {
+ result = slice->cls_ops->clo_closure(env, slice,
+ closure);
+ if (result != 0)
+ break;
+ }
+ }
+ }
+ if (result != 0)
+ cl_lock_disclosure(env, closure);
+ return result;
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and if succeeded, adds it to the closure (never more than
+ * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting
+ * until next try-lock is likely to succeed.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure)
+{
+ int result = 0;
+
+ cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
+ if (!cl_lock_mutex_try(env, lock)) {
+ /*
+ * If lock->cll_inclosure is not empty, lock is already in
+ * this closure.
+ */
+ if (list_empty(&lock->cll_inclosure)) {
+ cl_lock_get_trust(lock);
+ lu_ref_add(&lock->cll_reference, "closure", closure);
+ list_add(&lock->cll_inclosure, &closure->clc_list);
+ closure->clc_nr++;
+ } else {
+ cl_lock_mutex_put(env, lock);
+ }
+ result = 0;
+ } else {
+ cl_lock_disclosure(env, closure);
+ if (closure->clc_wait) {
+ cl_lock_get_trust(lock);
+ lu_ref_add(&lock->cll_reference, "closure-w", closure);
+ cl_lock_mutex_put(env, closure->clc_origin);
+
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+ cl_lock_mutex_get(env, lock);
+ cl_lock_mutex_put(env, lock);
+
+ cl_lock_mutex_get(env, closure->clc_origin);
+ lu_ref_del(&lock->cll_reference, "closure-w", closure);
+ cl_lock_put(env, lock);
+ }
+ result = CLO_REPEAT;
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/** Releases mutices of enclosed locks. */
+void cl_lock_disclosure(const struct lu_env *env,
+ struct cl_lock_closure *closure)
+{
+ struct cl_lock *scan;
+ struct cl_lock *temp;
+
+ cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
+ list_for_each_entry_safe(scan, temp, &closure->clc_list,
+ cll_inclosure) {
+ list_del_init(&scan->cll_inclosure);
+ cl_lock_mutex_put(env, scan);
+ lu_ref_del(&scan->cll_reference, "closure", closure);
+ cl_lock_put(env, scan);
+ closure->clc_nr--;
+ }
+ LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/** Finalizes a closure. */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+ LASSERT(closure->clc_nr == 0);
+ LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
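+
+/*
+ * Illustrative sketch (editor's addition, not from the original sources):
+ * a typical closure life-cycle around an already mutexed "origin" lock;
+ * "wait" selects the blocking behaviour of cl_lock_enclosure() described
+ * above.
+ *
+ *   struct cl_lock_closure closure;
+ *
+ *   cl_lock_closure_init(env, &closure, origin, wait);
+ *   result = cl_lock_closure_build(env, lock, &closure);
+ *   if (result == 0) {
+ *       // ... operate on the locks on closure.clc_list ...
+ *       cl_lock_disclosure(env, &closure);
+ *   }
+ *   cl_lock_closure_fini(&closure);
+ */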
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that the lock is being
+ * destroyed, then destroys the lock. If there are holds on the lock, its
+ * destruction is postponed until all holds are released. This is called when
+ * a decision is made to destroy the lock in the future, e.g., when a blocking
+ * AST is received on it, or a fatal communication error happens.
+ *
+ * Caller must have a reference on this lock to prevent a situation, when
+ * deleted lock lingers in memory for indefinite time, because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ * cl_lock_nr_mutexed(env) == 1)
+ * [i.e., if a top-lock is deleted, mutexes of no other locks can be
+ * held, as deletion of sub-locks might require releasing a top-lock
+ * mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+ cl_lock_nr_mutexed(env) == 1));
+
+ cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+ if (lock->cll_holds == 0)
+ cl_lock_delete0(env, lock);
+ else
+ lock->cll_flags |= CLF_DOOMED;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Marks the lock as irrecoverably failed, and marks it for destruction. This
+ * happens when, e.g., the server fails to grant a lock to us, or a networking
+ * time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see clo_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ if (lock->cll_error == 0 && error != 0) {
+ cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+ lock->cll_error = error;
+ cl_lock_signal(env, lock);
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ }
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers (bottom-to-top) that the lock is being
+ * cancelled, then destroys the lock. If there are holds on the lock,
+ * cancellation is postponed until all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
+ if (lock->cll_holds == 0)
+ cl_lock_cancel0(env, lock);
+ else
+ lock->cll_flags |= CLF_CANCELPEND;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering given index and optionally different from a
+ * given \a except lock.
+ */
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+ struct cl_object *obj, pgoff_t index,
+ struct cl_lock *except,
+ int pending, int canceld)
+{
+ struct cl_object_header *head;
+ struct cl_lock *scan;
+ struct cl_lock *lock;
+ struct cl_lock_descr *need;
+
+ head = cl_object_header(obj);
+ need = &cl_env_info(env)->clt_descr;
+ lock = NULL;
+
+ need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+ * not PHANTOM */
+ need->cld_start = need->cld_end = index;
+ need->cld_enq_flags = 0;
+
+ spin_lock(&head->coh_lock_guard);
+ /* It is fine to match any group lock since there could be only one
+ * with a unique gid, and it conflicts with all other lock modes too */
+ list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+ if (scan != except &&
+ (scan->cll_descr.cld_mode == CLM_GROUP ||
+ cl_lock_ext_match(&scan->cll_descr, need)) &&
+ scan->cll_state >= CLS_HELD &&
+ scan->cll_state < CLS_FREEING &&
+ /*
+ * This check is racy as the lock can be canceled right
+ * after it is done, but this is fine, because page exists
+ * already.
+ */
+ (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+ (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+ /* Don't increase cs_hit here since this
+ * is just a helper function. */
+ cl_lock_get_trust(scan);
+ lock = scan;
+ break;
+ }
+ }
+ spin_unlock(&head->coh_lock_guard);
+ return lock;
+}
+EXPORT_SYMBOL(cl_lock_at_pgoff);
+
+/**
+ * Calculate the page offset at the layer of @lock.
+ * At the time of this writing, @page is top page and @lock is sub lock.
+ */
+static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock)
+{
+ struct lu_device_type *dtype;
+ const struct cl_page_slice *slice;
+
+ dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type;
+ slice = cl_page_at(page, dtype);
+ LASSERT(slice != NULL);
+ return slice->cpl_page->cp_index;
+}
+
+/**
+ * Check if page @page is covered by an extra lock or discard it.
+ */
+static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+ struct cl_lock *lock = cbdata;
+ pgoff_t index = pgoff_at_lock(page, lock);
+
+ if (index >= info->clt_fn_index) {
+ struct cl_lock *tmp;
+
+ /* refresh non-overlapped index */
+ tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index,
+ lock, 1, 0);
+ if (tmp != NULL) {
+ /* Cache the first-non-overlapped index so as to skip
+ * all pages within [index, clt_fn_index). This
+ * is safe because if tmp lock is canceled, it will
+ * discard these pages. */
+ info->clt_fn_index = tmp->cll_descr.cld_end + 1;
+ if (tmp->cll_descr.cld_end == CL_PAGE_EOF)
+ info->clt_fn_index = CL_PAGE_EOF;
+ cl_lock_put(env, tmp);
+ } else if (cl_page_own(env, io, page) == 0) {
+ /* discard the page */
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ } else {
+ LASSERT(page->cp_state == CPS_FREEING);
+ }
+ }
+
+ info->clt_next_index = index + 1;
+ return CLP_GANG_OKAY;
+}
+
+static int discard_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+ struct cl_lock *lock = cbdata;
+
+ LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+ KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+ !PageWriteback(cl_page_vmpage(env, page))));
+ KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+ !PageDirty(cl_page_vmpage(env, page))));
+
+ info->clt_next_index = pgoff_at_lock(page, lock) + 1;
+ if (cl_page_own(env, io, page) == 0) {
+ /* discard the page */
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ } else {
+ LASSERT(page->cp_state == CPS_FREEING);
+ }
+
+ return CLP_GANG_OKAY;
+}
+
+/**
+ * Discards pages protected by the given lock. This function traverses the
+ * radix tree to find all covering pages and discards them. If a page is
+ * covered by other locks, it should remain in cache.
+ *
+ * If an error happens at any step, the process continues anyway (the
+ * reasoning behind this being that lock cancellation cannot be delayed
+ * indefinitely).
+ */
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+ struct cl_io *io = &info->clt_io;
+ struct cl_lock_descr *descr = &lock->cll_descr;
+ cl_page_gang_cb_t cb;
+ int res;
+ int result;
+
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ io->ci_obj = cl_object_top(descr->cld_obj);
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+ if (result != 0)
+ goto out;
+
+ cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb;
+ info->clt_fn_index = info->clt_next_index = descr->cld_start;
+ do {
+ res = cl_page_gang_lookup(env, descr->cld_obj, io,
+ info->clt_next_index, descr->cld_end,
+ cb, (void *)lock);
+ if (info->clt_next_index > descr->cld_end)
+ break;
+
+ if (res == CLP_GANG_RESCHED)
+ cond_resched();
+ } while (res != CLP_GANG_OKAY);
+out:
+ cl_io_fini(env, io);
+ return result;
+}
+EXPORT_SYMBOL(cl_lock_discard_pages);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ * destroying them.
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+ struct cl_object_header *head;
+ struct cl_lock *lock;
+
+ head = cl_object_header(obj);
+ /*
+ * If locks are destroyed without cancellation, all pages must be
+ * already destroyed (as otherwise they will be left unprotected).
+ */
+ LASSERT(ergo(!cancel,
+ head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+ spin_lock(&head->coh_lock_guard);
+ while (!list_empty(&head->coh_locks)) {
+ lock = container_of(head->coh_locks.next,
+ struct cl_lock, cll_linkage);
+ cl_lock_get_trust(lock);
+ spin_unlock(&head->coh_lock_guard);
+ lu_ref_add(&lock->cll_reference, "prune", current);
+
+again:
+ cl_lock_mutex_get(env, lock);
+ if (lock->cll_state < CLS_FREEING) {
+ LASSERT(lock->cll_users <= 1);
+ if (unlikely(lock->cll_users == 1)) {
+ struct l_wait_info lwi = { 0 };
+
+ cl_lock_mutex_put(env, lock);
+ l_wait_event(lock->cll_wq,
+ lock->cll_users == 0,
+ &lwi);
+ goto again;
+ }
+
+ if (cancel)
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ }
+ cl_lock_mutex_put(env, lock);
+ lu_ref_del(&lock->cll_reference, "prune", current);
+ cl_lock_put(env, lock);
+ spin_lock(&head->coh_lock_guard);
+ }
+ spin_unlock(&head->coh_lock_guard);
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+ const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_lock *lock;
+
+ while (1) {
+ lock = cl_lock_find(env, io, need);
+ if (IS_ERR(lock))
+ break;
+ cl_lock_mutex_get(env, lock);
+ if (lock->cll_state < CLS_FREEING &&
+ !(lock->cll_flags & CLF_CANCELLED)) {
+ cl_lock_hold_mod(env, lock, +1);
+ lu_ref_add(&lock->cll_holders, scope, source);
+ lu_ref_add(&lock->cll_reference, scope, source);
+ break;
+ }
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+ }
+ return lock;
+}
+
+/**
+ * Returns a lock matching \a need description with a reference and a hold on
+ * it.
+ *
+ * This is much like cl_lock_find(), except that cl_lock_hold() additionally
+ * guarantees that lock is not in the CLS_FREEING state on return.
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_lock *lock;
+
+ lock = cl_lock_hold_mutex(env, io, need, scope, source);
+ if (!IS_ERR(lock))
+ cl_lock_mutex_put(env, lock);
+ return lock;
+}
+EXPORT_SYMBOL(cl_lock_hold);
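+
+/*
+ * Typical usage of cl_lock_hold() (illustrative sketch only; the scope
+ * string and cookie are arbitrary caller-chosen markers). The hold and
+ * reference taken here are dropped by cl_lock_release(), defined below.
+ *
+ *	lock = cl_lock_hold(env, io, need, "my_scope", my_cookie);
+ *	if (!IS_ERR(lock)) {
+ *		... use the lock ...
+ *		cl_lock_release(env, lock, "my_scope", my_cookie);
+ *	}
+ */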
+
+/**
+ * Main high-level entry point of cl_lock interface that finds existing or
+ * enqueues new lock matching given description.
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_lock *lock;
+ int rc;
+ __u32 enqflags = need->cld_enq_flags;
+
+ do {
+ lock = cl_lock_hold_mutex(env, io, need, scope, source);
+ if (IS_ERR(lock))
+ break;
+
+ rc = cl_enqueue_locked(env, lock, io, enqflags);
+ if (rc == 0) {
+ if (cl_lock_fits_into(env, lock, need, io)) {
+ if (!(enqflags & CEF_AGL)) {
+ cl_lock_mutex_put(env, lock);
+ cl_lock_lockdep_acquire(env, lock,
+ enqflags);
+ break;
+ }
+ rc = 1;
+ }
+ cl_unuse_locked(env, lock);
+ }
+ cl_lock_trace(D_DLMTRACE, env,
+ rc <= 0 ? "enqueue failed" : "agl succeeded", lock);
+ cl_lock_hold_release(env, lock, scope, source);
+ cl_lock_mutex_put(env, lock);
+ lu_ref_del(&lock->cll_reference, scope, source);
+ cl_lock_put(env, lock);
+ if (rc > 0) {
+ LASSERT(enqflags & CEF_AGL);
+ lock = NULL;
+ } else if (rc != 0) {
+ lock = ERR_PTR(rc);
+ }
+ } while (rc == 0);
+ return lock;
+}
+EXPORT_SYMBOL(cl_lock_request);
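+
+/*
+ * Note on the return value of cl_lock_request(): on success a held lock is
+ * returned; if the enqueue fails an ERR_PTR() is returned; and when CEF_AGL
+ * is set and the enqueue succeeds, the lock is dropped again and NULL is
+ * returned to the caller.
+ */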
+
+/**
+ * Adds a hold to a known lock.
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_state != CLS_FREEING);
+
+ cl_lock_hold_mod(env, lock, +1);
+ cl_lock_get(lock);
+ lu_ref_add(&lock->cll_holders, scope, source);
+ lu_ref_add(&lock->cll_reference, scope, source);
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock, on which caller acquired a
+ * mutex.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_invariant(env, lock));
+ cl_lock_hold_release(env, lock, scope, source);
+ lu_ref_del(&lock->cll_reference, scope, source);
+ cl_lock_put(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_invariant(env, lock));
+ cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
+ cl_lock_mutex_get(env, lock);
+ cl_lock_hold_release(env, lock, scope, source);
+ cl_lock_mutex_put(env, lock);
+ lu_ref_del(&lock->cll_reference, scope, source);
+ cl_lock_put(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ cl_lock_used_mod(env, lock, +1);
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_users > 0);
+
+ cl_lock_used_mod(env, lock, -1);
+ if (lock->cll_users == 0)
+ wake_up_all(&lock->cll_wq);
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+ static const char *names[] = {
+ [CLM_PHANTOM] = "P",
+ [CLM_READ] = "R",
+ [CLM_WRITE] = "W",
+ [CLM_GROUP] = "G"
+ };
+ if (0 <= mode && mode < ARRAY_SIZE(names))
+ return names[mode];
+ else
+ return "U";
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints human readable representation of a lock description.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_lock_descr *descr)
+{
+ const struct lu_fid *fid;
+
+ fid = lu_object_fid(&descr->cld_obj->co_lu);
+ (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock to the \a f.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+
+ (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+ lock, atomic_read(&lock->cll_ref),
+ lock->cll_state, lock->cll_error, lock->cll_holds,
+ lock->cll_users, lock->cll_flags);
+ cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+ (*printer)(env, cookie, " {\n");
+
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ (*printer)(env, cookie, " %s@%p: ",
+ slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+ slice);
+ if (slice->cls_ops->clo_print != NULL)
+ slice->cls_ops->clo_print(env, cookie, printer, slice);
+ (*printer)(env, cookie, "\n");
+ }
+ (*printer)(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+int cl_lock_init(void)
+{
+ return lu_kmem_init(cl_lock_caches);
+}
+
+void cl_lock_fini(void)
+{
+ lu_kmem_fini(cl_lock_caches);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c
new file mode 100644
index 000000000..f13d1fbff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_object.c
@@ -0,0 +1,1139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ * i_mutex
+ * PG_locked
+ * ->coh_page_guard
+ * ->coh_lock_guard
+ * ->coh_attr_guard
+ * ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../../include/linux/libcfs/libcfs.h"
+/* class_put_type() */
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_fid.h"
+#include <linux/list.h>
+#include "../../include/linux/libcfs/libcfs_hash.h" /* for cfs_hash stuff */
+#include "../include/cl_object.h"
+#include "cl_internal.h"
+
+static struct kmem_cache *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+/**
+ * Initialize cl_object_header.
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+ int result;
+
+ result = lu_object_header_init(&h->coh_lu);
+ if (result == 0) {
+ spin_lock_init(&h->coh_page_guard);
+ spin_lock_init(&h->coh_lock_guard);
+ spin_lock_init(&h->coh_attr_guard);
+ lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+ lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+ lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+ h->coh_pages = 0;
+ /* XXX hard coded GFP_* mask. */
+ INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+ INIT_LIST_HEAD(&h->coh_locks);
+ h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8);
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+ LASSERT(list_empty(&h->coh_locks));
+ lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid.
+ *
+ * Returns either cached or newly created object. Additional reference on the
+ * returned object is acquired.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+ struct cl_device *cd, const struct lu_fid *fid,
+ const struct cl_object_conf *c)
+{
+ might_sleep();
+ return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu));
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When the last reference is released, the object is returned to the cache,
+ * unless the lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in
+ * its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+ lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
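+
+/*
+ * Lookup/release pairing (illustrative sketch; cd, fid and conf stand for
+ * the caller's device, fid and configuration): the reference acquired by
+ * cl_object_find() is dropped with cl_object_put().
+ *
+ *	obj = cl_object_find(env, cd, fid, conf);
+ *	if (!IS_ERR(obj)) {
+ *		... use the object ...
+ *		cl_object_put(env, obj);
+ *	}
+ */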
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire _additional_ reference, i.e., caller
+ * already has to possess at least one reference to \a o before calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+ lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+ struct cl_object_header *hdr = cl_object_header(o);
+ struct cl_object *top;
+
+ while (hdr->coh_parent != NULL)
+ hdr = hdr->coh_parent;
+
+ top = lu2cl(lu_object_top(&hdr->coh_lu));
+ CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+ return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns pointer to the lock protecting data-attributes for the given object
+ * \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+ return &cl_object_header(cl_object_top(o))->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing, until lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+ __acquires(cl_object_attr_guard(o))
+{
+ spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+ __releases(cl_object_attr_guard(o))
+{
+ spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
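+
+/*
+ * Illustrative sketch of attribute access: cl_object_attr_get() and
+ * cl_object_attr_set() below must run under the attribute lock, as their
+ * assert_spin_locked() checks enforce.
+ *
+ *	cl_object_attr_lock(obj);
+ *	rc = cl_object_attr_get(env, obj, attr);
+ *	cl_object_attr_unlock(obj);
+ */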
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ struct lu_object_header *top;
+ int result;
+
+ assert_spin_locked(cl_object_attr_guard(obj));
+
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+ if (obj->co_ops->coo_attr_get != NULL) {
+ result = obj->co_ops->coo_attr_get(env, obj, attr);
+ if (result != 0) {
+ if (result > 0)
+ result = 0;
+ break;
+ }
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only the attributes mentioned in the validity bit-mask \a v are updated.
+ * Calls cl_object_operations::coo_attr_set() on every layer, bottom to top.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned v)
+{
+ struct lu_object_header *top;
+ int result;
+
+ assert_spin_locked(cl_object_attr_guard(obj));
+
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry_reverse(obj, &top->loh_layers,
+ co_lu.lo_linkage) {
+ if (obj->co_ops->coo_attr_set != NULL) {
+ result = obj->co_ops->coo_attr_set(env, obj, attr, v);
+ if (result != 0) {
+ if (result > 0)
+ result = 0;
+ break;
+ }
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_object_attr_set);
+
+/**
+ * Notifies layers (bottom-to-top) that glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to glimpse issuer.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+ struct ost_lvb *lvb)
+{
+ struct lu_object_header *top;
+ int result;
+
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry_reverse(obj, &top->loh_layers,
+ co_lu.lo_linkage) {
+ if (obj->co_ops->coo_glimpse != NULL) {
+ result = obj->co_ops->coo_glimpse(env, obj, lvb);
+ if (result != 0)
+ break;
+ }
+ }
+ LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+ "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu\n",
+ lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+ lvb->lvb_ctime, lvb->lvb_blocks);
+ return result;
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf)
+{
+ struct lu_object_header *top;
+ int result;
+
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+ if (obj->co_ops->coo_conf_set != NULL) {
+ result = obj->co_ops->coo_conf_set(env, obj, conf);
+ if (result != 0)
+ break;
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+ struct cl_object_header *hdr;
+
+ hdr = cl_object_header(obj);
+ LASSERT(hdr->coh_tree.rnode == NULL);
+ LASSERT(hdr->coh_pages == 0);
+
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+ /*
+ * Destroy all locks. Object destruction (including cl_inode_fini())
+ * cannot cancel the locks, because in the case of a local client,
+ * where client and server share the same thread running
+ * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+ * waiting on __wait_on_freeing_inode().
+ */
+ cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+ cl_pages_prune(env, obj);
+ cl_locks_prune(env, obj, 1);
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Check if the object has locks.
+ */
+int cl_object_has_locks(struct cl_object *obj)
+{
+ struct cl_object_header *head = cl_object_header(obj);
+ int has;
+
+ spin_lock(&head->coh_lock_guard);
+ has = list_empty(&head->coh_locks);
+ spin_unlock(&head->coh_lock_guard);
+
+ return (has == 0);
+}
+EXPORT_SYMBOL(cl_object_has_locks);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+ int i;
+
+ cs->cs_name = name;
+ for (i = 0; i < CS_NR; i++)
+ atomic_set(&cs->cs_stats[i], 0);
+}
+
+int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h)
+{
+ int i;
+ /*
+ * lookup hit total cached create
+ * env: ...... ...... ...... ...... ......
+ */
+ if (h) {
+ const char *names[CS_NR] = CS_NAMES;
+
+ seq_printf(m, "%6s", " ");
+ for (i = 0; i < CS_NR; i++)
+ seq_printf(m, "%8s", names[i]);
+ seq_printf(m, "\n");
+ }
+
+ seq_printf(m, "%5.5s:", cs->cs_name);
+ for (i = 0; i < CS_NR; i++)
+ seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i]));
+ return 0;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()), and initialize statistical
+ * counters. Also perform global initializations on the first call.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+ int i;
+ int result;
+
+ result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+ if (result == 0) {
+ cache_stats_init(&s->cs_pages, "pages");
+ cache_stats_init(&s->cs_locks, "locks");
+ for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+ atomic_set(&s->cs_pages_state[i], 0);
+ for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+ atomic_set(&s->cs_locks_state[i], 0);
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+ lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = {
+ .cs_name = "envs",
+ .cs_stats = { ATOMIC_INIT(0), }
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m)
+{
+ int i;
+ static const char *pstate[] = {
+ [CPS_CACHED] = "c",
+ [CPS_OWNED] = "o",
+ [CPS_PAGEOUT] = "w",
+ [CPS_PAGEIN] = "r",
+ [CPS_FREEING] = "f"
+ };
+ static const char *lstate[] = {
+ [CLS_NEW] = "n",
+ [CLS_QUEUING] = "q",
+ [CLS_ENQUEUED] = "e",
+ [CLS_HELD] = "h",
+ [CLS_INTRANSIT] = "t",
+ [CLS_CACHED] = "c",
+ [CLS_FREEING] = "f"
+ };
+/*
+ lookup hit total busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+ env: ...... ...... ...... ...... ......
+ */
+ lu_site_stats_print(&site->cs_lu, m);
+ cache_stats_print(&site->cs_pages, m, 1);
+ seq_printf(m, " [");
+ for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+ seq_printf(m, "%s: %u ", pstate[i],
+ atomic_read(&site->cs_pages_state[i]));
+ seq_printf(m, "]\n");
+ cache_stats_print(&site->cs_locks, m, 0);
+ seq_printf(m, " [");
+ for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i)
+ seq_printf(m, "%s: %u ", lstate[i],
+ atomic_read(&site->cs_locks_state[i]));
+ seq_printf(m, "]\n");
+ cache_stats_print(&cl_env_stats, m, 0);
+ seq_printf(m, "\n");
+ return 0;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store the cl_env pointer in task-specific
+ * structures. On Linux it won't be easy to use task_struct->journal_info,
+ * because Lustre code may call into other filesystems that have their own
+ * assumptions about journal_info. The following task_struct fields have been
+ * identified as candidates for this purpose:
+ * - cl_env: for liblustre.
+ * - tux_info: only on RedHat kernels.
+ * - ...
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * we have called into Lustre, we never call into another part of the kernel
+ * that uses those task_struct fields without explicitly exiting Lustre
+ * first.
+ *
+ * If no space in task_struct is available, a hash table is used instead.
+ * bz20044, bz22683.
+ */
+
+struct cl_env {
+ void *ce_magic;
+ struct lu_env ce_lu;
+ struct lu_context ce_ses;
+
+ /**
+ * This allows cl_env to be entered into cl_env_hash which implements
+ * the current thread -> client environment lookup.
+ */
+ struct hlist_node ce_node;
+ /**
+ * Owner for the current cl_env.
+ *
+ * If LL_TASK_CL_ENV is defined, this points to the owning current task
+ * and is used only for debugging.
+ * Otherwise a hash table is used, and this is the key for cfs_hash;
+ * currently the thread PID is stored. Note that using the thread
+ * pointer instead would lead to an unbalanced hash because of its
+ * allocation locality, which varies between platforms, OSes, and even
+ * OS versions.
+ */
+ void *ce_owner;
+
+ /*
+ * Linkage into global list of all client environments. Used for
+ * garbage collection.
+ */
+ struct list_head ce_linkage;
+ /*
+ * Reference counter for this cl_env.
+ */
+ int ce_ref;
+ /*
+ * Debugging field: address of the caller who made original
+ * allocation.
+ */
+ void *ce_debug;
+};
+
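+/*
+ * These statistics hooks expand to nothing, so the CL_ENV_INC()/CL_ENV_DEC()
+ * calls below never update the counters in cl_env_stats above.
+ */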
+#define CL_ENV_INC(counter)
+#define CL_ENV_DEC(counter)
+
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+ LASSERT(cle->ce_ref == 0);
+ LASSERT(cle->ce_magic == &cl_env_init0);
+ LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+
+ cle->ce_ref = 1;
+ cle->ce_debug = debug;
+ CL_ENV_INC(busy);
+}
+
+
+/*
+ * The implementation of using hash table to connect cl_env and thread
+ */
+
+static struct cfs_hash *cl_env_hash;
+
+static unsigned cl_env_hops_hash(struct cfs_hash *lh,
+ const void *key, unsigned mask)
+{
+#if BITS_PER_LONG == 64
+ return cfs_hash_u64_hash((__u64)key, mask);
+#else
+ return cfs_hash_u32_hash((__u32)key, mask);
+#endif
+}
+
+static void *cl_env_hops_obj(struct hlist_node *hn)
+{
+ struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+ LASSERT(cle->ce_magic == &cl_env_init0);
+ return (void *)cle;
+}
+
+static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn)
+{
+ struct cl_env *cle = cl_env_hops_obj(hn);
+
+ LASSERT(cle->ce_owner != NULL);
+ return (key == cle->ce_owner);
+}
+
+static void cl_env_hops_noop(struct cfs_hash *hs, struct hlist_node *hn)
+{
+ struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+ LASSERT(cle->ce_magic == &cl_env_init0);
+}
+
+static cfs_hash_ops_t cl_env_hops = {
+ .hs_hash = cl_env_hops_hash,
+ .hs_key = cl_env_hops_obj,
+ .hs_keycmp = cl_env_hops_keycmp,
+ .hs_object = cl_env_hops_obj,
+ .hs_get = cl_env_hops_noop,
+ .hs_put_locked = cl_env_hops_noop,
+};
+
+static inline struct cl_env *cl_env_fetch(void)
+{
+ struct cl_env *cle;
+
+ cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid);
+ LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0));
+ return cle;
+}
+
+static inline void cl_env_attach(struct cl_env *cle)
+{
+ if (cle) {
+ int rc;
+
+ LASSERT(cle->ce_owner == NULL);
+ cle->ce_owner = (void *) (long) current->pid;
+ rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner,
+ &cle->ce_node);
+ LASSERT(rc == 0);
+ }
+}
+
+static inline void cl_env_do_detach(struct cl_env *cle)
+{
+ void *cookie;
+
+ LASSERT(cle->ce_owner == (void *) (long) current->pid);
+ cookie = cfs_hash_del(cl_env_hash, cle->ce_owner,
+ &cle->ce_node);
+ LASSERT(cookie == cle);
+ cle->ce_owner = NULL;
+}
+
+static int cl_env_store_init(void)
+{
+ cl_env_hash = cfs_hash_create("cl_env",
+ HASH_CL_ENV_BITS, HASH_CL_ENV_BITS,
+ HASH_CL_ENV_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &cl_env_hops,
+ CFS_HASH_RW_BKTLOCK);
+ return cl_env_hash != NULL ? 0 : -ENOMEM;
+}
+
+static void cl_env_store_fini(void)
+{
+ cfs_hash_putref(cl_env_hash);
+}
+
+
+static inline struct cl_env *cl_env_detach(struct cl_env *cle)
+{
+ if (cle == NULL)
+ cle = cl_env_fetch();
+
+ if (cle && cle->ce_owner)
+ cl_env_do_detach(cle);
+
+ return cle;
+}
+
+static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug)
+{
+ struct lu_env *env;
+ struct cl_env *cle;
+
+ OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, GFP_NOFS);
+ if (cle != NULL) {
+ int rc;
+
+ INIT_LIST_HEAD(&cle->ce_linkage);
+ cle->ce_magic = &cl_env_init0;
+ env = &cle->ce_lu;
+ rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags);
+ if (rc == 0) {
+ rc = lu_context_init(&cle->ce_ses,
+ LCT_SESSION | ses_tags);
+ if (rc == 0) {
+ lu_context_enter(&cle->ce_ses);
+ env->le_ses = &cle->ce_ses;
+ cl_env_init0(cle, debug);
+ } else
+ lu_env_fini(env);
+ }
+ if (rc != 0) {
+ OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+ env = ERR_PTR(rc);
+ } else {
+ CL_ENV_INC(create);
+ CL_ENV_INC(total);
+ }
+ } else
+ env = ERR_PTR(-ENOMEM);
+ return env;
+}
+
+static void cl_env_fini(struct cl_env *cle)
+{
+ CL_ENV_DEC(total);
+ lu_context_fini(&cle->ce_lu.le_ctx);
+ lu_context_fini(&cle->ce_ses);
+ OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+}
+
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+ return container_of(env, struct cl_env, ce_lu);
+}
+
+struct lu_env *cl_env_peek(int *refcheck)
+{
+ struct lu_env *env;
+ struct cl_env *cle;
+
+ CL_ENV_INC(lookup);
+
+ /* check that we don't go far from untrusted pointer */
+ CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+ env = NULL;
+ cle = cl_env_fetch();
+ if (cle != NULL) {
+ CL_ENV_INC(hit);
+ env = &cle->ce_lu;
+ *refcheck = ++cle->ce_ref;
+ }
+ CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle);
+ return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns an lu_env: if there already is an environment associated with the
+ * current thread, it is returned; otherwise, a new environment is allocated.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+ struct lu_env *env;
+
+ env = cl_env_peek(refcheck);
+ if (env == NULL) {
+ env = cl_env_new(lu_context_tags_default,
+ lu_session_tags_default,
+ __builtin_return_address(0));
+
+ if (!IS_ERR(env)) {
+ struct cl_env *cle;
+
+ cle = cl_env_container(env);
+ cl_env_attach(cle);
+ *refcheck = cle->ce_ref;
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+ }
+ }
+ return env;
+}
+EXPORT_SYMBOL(cl_env_get);
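+
+/*
+ * Typical cl_env_get()/cl_env_put() pairing (illustrative sketch): both
+ * calls share the same refcheck integer, as described above.
+ *
+ *	int refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (!IS_ERR(env)) {
+ *		... use env ...
+ *		cl_env_put(env, &refcheck);
+ *	}
+ */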
+
+/**
+ * Forces an allocation of a fresh environment with given tags.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+ struct lu_env *env;
+
+ LASSERT(cl_env_peek(refcheck) == NULL);
+ env = cl_env_new(tags, tags, __builtin_return_address(0));
+ if (!IS_ERR(env)) {
+ struct cl_env *cle;
+
+ cle = cl_env_container(env);
+ *refcheck = cle->ce_ref;
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+ }
+ return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+static void cl_env_exit(struct cl_env *cle)
+{
+ LASSERT(cle->ce_owner == NULL);
+ lu_context_exit(&cle->ce_lu.le_ctx);
+ lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Release an environment.
+ *
+ * Decrement \a env reference counter. When counter drops to 0, nothing in
+ * this thread is using environment and it is returned to the allocation
+ * cache, or freed straight away, if cache is large enough.
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+ struct cl_env *cle;
+
+ cle = cl_env_container(env);
+
+ LASSERT(cle->ce_ref > 0);
+ LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+ if (--cle->ce_ref == 0) {
+ CL_ENV_DEC(busy);
+ cl_env_detach(cle);
+ cle->ce_debug = NULL;
+ cl_env_exit(cle);
+ cl_env_fini(cle);
+ }
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+ return cl_env_detach(NULL);
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ */
+void cl_env_reexit(void *cookie)
+{
+ cl_env_detach(NULL);
+ cl_env_attach(cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Set up the user-supplied \a env as the current environment. This is used
+ * to guarantee that an environment exists even when cl_env_get() fails. It
+ * is up to the user to ensure proper concurrency control.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+ struct cl_env *cle = cl_env_container(env);
+
+ LASSERT(cle->ce_ref > 0);
+
+ cl_env_attach(cle);
+ cl_env_get(refcheck);
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+ struct cl_env *cle = cl_env_container(env);
+
+ LASSERT(cle->ce_ref > 1);
+
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+
+ cl_env_detach(cle);
+ cl_env_put(env, refcheck);
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+ struct lu_env *env;
+
+ nest->cen_cookie = NULL;
+ env = cl_env_peek(&nest->cen_refcheck);
+ if (env != NULL) {
+ if (!cl_io_is_going(env))
+ return env;
+ else {
+ cl_env_put(env, &nest->cen_refcheck);
+ nest->cen_cookie = cl_env_reenter();
+ }
+ }
+ env = cl_env_get(&nest->cen_refcheck);
+ if (IS_ERR(env)) {
+ cl_env_reexit(nest->cen_cookie);
+ return env;
+ }
+
+ LASSERT(!cl_io_is_going(env));
+ return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+ cl_env_put(env, &nest->cen_refcheck);
+ cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
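+
+/*
+ * Nested environment usage (illustrative sketch): cl_env_nested_get() either
+ * reuses the current thread's environment when no io is going on in it, or
+ * re-enters and sets up a fresh one; cl_env_nested_put() undoes whichever
+ * path was taken.
+ *
+ *	struct cl_env_nest nest;
+ *	struct lu_env *env = cl_env_nested_get(&nest);
+ *
+ *	if (!IS_ERR(env)) {
+ *		... use env ...
+ *		cl_env_nested_put(&nest, env);
+ *	}
+ */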
+
+/**
+ * Converts struct cl_attr to struct ost_lvb.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+ lvb->lvb_size = attr->cat_size;
+ lvb->lvb_mtime = attr->cat_mtime;
+ lvb->lvb_atime = attr->cat_atime;
+ lvb->lvb_ctime = attr->cat_ctime;
+ lvb->lvb_blocks = attr->cat_blocks;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Converts struct ost_lvb to struct cl_attr.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+ attr->cat_size = lvb->lvb_size;
+ attr->cat_mtime = lvb->lvb_mtime;
+ attr->cat_atime = lvb->lvb_atime;
+ attr->cat_ctime = lvb->lvb_ctime;
+ attr->cat_blocks = lvb->lvb_blocks;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+ struct lu_device_type *ldt,
+ struct lu_device *next)
+{
+ const char *typename;
+ struct lu_device *d;
+
+ LASSERT(ldt != NULL);
+
+ typename = ldt->ldt_name;
+ d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+ if (!IS_ERR(d)) {
+ int rc;
+
+ if (site != NULL)
+ d->ld_site = site;
+ rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next);
+ if (rc == 0) {
+ lu_device_get(d);
+ lu_ref_add(&d->ld_reference,
+ "lu-stack", &lu_site_init);
+ } else {
+ ldt->ldt_ops->ldto_device_free(env, d);
+ CERROR("can't init device '%s', %d\n", typename, rc);
+ d = ERR_PTR(rc);
+ }
+ } else
+ CERROR("Cannot allocate device: '%s'\n", typename);
+ return lu2cl_dev(d);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini().
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+ lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+int cl_lock_init(void);
+void cl_lock_fini(void);
+
+int cl_page_init(void);
+void cl_page_fini(void);
+
+static struct lu_context_key cl_key;
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+ return lu_context_key_get(&env->le_ctx, &cl_key);
+}
+
+/* defines cl0_key_{init,fini}() */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+static void *cl_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct cl_thread_info *info;
+
+ info = cl0_key_init(ctx, key);
+ if (!IS_ERR(info)) {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+ lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+ }
+ return info;
+}
+
+static void cl_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct cl_thread_info *info;
+ int i;
+
+ info = data;
+ for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+ lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+ cl0_key_fini(ctx, key, data);
+}
+
+static void cl_key_exit(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct cl_thread_info *info = data;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) {
+ LASSERT(info->clt_counters[i].ctc_nr_held == 0);
+ LASSERT(info->clt_counters[i].ctc_nr_used == 0);
+ LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0);
+ LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+ lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+ lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+ }
+}
+
+static struct lu_context_key cl_key = {
+ .lct_tags = LCT_CL_THREAD,
+ .lct_init = cl_key_init,
+ .lct_fini = cl_key_fini,
+ .lct_exit = cl_key_exit
+};
+
+static struct lu_kmem_descr cl_object_caches[] = {
+ {
+ .ckd_cache = &cl_env_kmem,
+ .ckd_name = "cl_env_kmem",
+ .ckd_size = sizeof (struct cl_env)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+/**
+ * Global initialization of cl-data. Create kmem caches, register
+ * lu_context_key's, etc.
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+ int result;
+
+ result = cl_env_store_init();
+ if (result)
+ return result;
+
+ result = lu_kmem_init(cl_object_caches);
+ if (result)
+ goto out_store;
+
+ LU_CONTEXT_KEY_INIT(&cl_key);
+ result = lu_context_key_register(&cl_key);
+ if (result)
+ goto out_kmem;
+
+ result = cl_lock_init();
+ if (result)
+ goto out_context;
+
+ result = cl_page_init();
+ if (result)
+ goto out_lock;
+
+ return 0;
+out_lock:
+ cl_lock_fini();
+out_context:
+ lu_context_key_degister(&cl_key);
+out_kmem:
+ lu_kmem_fini(cl_object_caches);
+out_store:
+ cl_env_store_fini();
+ return result;
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init().
+ */
+void cl_global_fini(void)
+{
+ cl_lock_fini();
+ cl_page_fini();
+ lu_context_key_degister(&cl_key);
+ lu_kmem_fini(cl_object_caches);
+ cl_env_store_fini();
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
new file mode 100644
index 000000000..b7dd04808
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c
@@ -0,0 +1,1553 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include <linux/list.h>
+
+#include "../include/cl_object.h"
+#include "cl_internal.h"
+
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+ int radix);
+
+# define PASSERT(env, page, expr) \
+ do { \
+ if (unlikely(!(expr))) { \
+ CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \
+ LASSERT(0); \
+ } \
+ } while (0)
+
+# define PINVRNT(env, page, exp) \
+ ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+
+/* Disable page statistic by default due to huge performance penalty. */
+#define CS_PAGE_INC(o, item)
+#define CS_PAGE_DEC(o, item)
+#define CS_PAGESTATE_INC(o, state)
+#define CS_PAGESTATE_DEC(o, state)
+
+/**
+ * Internal version of cl_page_top(); it should only be called when the page
+ * is known not to be freed, e.g., when the page is referenced, the radix
+ * tree lock is held, or the page is owned.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+ while (page->cp_parent != NULL)
+ page = page->cp_parent;
+ return page;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain initial reference to previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking the page radix-tree
+ * (cl_object_header::coh_page_guard), or by keeping a lock on the VM page
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+ LASSERT(atomic_read(&page->cp_ref) > 0);
+ atomic_inc(&page->cp_ref);
+}
+
+/**
+ * Returns a slice within a page, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+ const struct lu_device_type *dtype)
+{
+ const struct cl_page_slice *slice;
+
+ page = cl_page_top_trusted((struct cl_page *)page);
+ do {
+ list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+ if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+ return slice;
+ }
+ page = page->cp_child;
+ } while (page != NULL);
+ return NULL;
+}
+
+/**
+ * Returns a page with given index in the given object, or NULL if no page is
+ * found. Acquires a reference on \a page.
+ *
+ * Locking: called under cl_object_header::coh_page_guard spin-lock.
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+ struct cl_page *page;
+
+ assert_spin_locked(&hdr->coh_page_guard);
+
+ page = radix_tree_lookup(&hdr->coh_tree, index);
+ if (page != NULL)
+ cl_page_get_trust(page);
+ return page;
+}
+EXPORT_SYMBOL(cl_page_lookup);
+
+/**
+ * Looks up pages covering the [start, end] index range of \a obj and passes
+ * each of them to the callback \a cb.
+ *
+ * If the lookup gives up before hogging the CPU for too long, it returns
+ * CLP_GANG_RESCHED; in that case the caller should implement retry logic.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ *
+ * At least one covered page is handed to \a cb unless the range contains no
+ * covered pages.
+ */
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io, pgoff_t start, pgoff_t end,
+ cl_page_gang_cb_t cb, void *cbdata)
+{
+ struct cl_object_header *hdr;
+ struct cl_page *page;
+ struct cl_page **pvec;
+ const struct cl_page_slice *slice;
+ const struct lu_device_type *dtype;
+ pgoff_t idx;
+ unsigned int nr;
+ unsigned int i;
+ unsigned int j;
+ int res = CLP_GANG_OKAY;
+ int tree_lock = 1;
+
+ idx = start;
+ hdr = cl_object_header(obj);
+ pvec = cl_env_info(env)->clt_pvec;
+ dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+ spin_lock(&hdr->coh_page_guard);
+ while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+ idx, CLT_PVEC_SIZE)) > 0) {
+ int end_of_region = 0;
+ idx = pvec[nr - 1]->cp_index + 1;
+ for (i = 0, j = 0; i < nr; ++i) {
+ page = pvec[i];
+ pvec[i] = NULL;
+
+ LASSERT(page->cp_type == CPT_CACHEABLE);
+ if (page->cp_index > end) {
+ end_of_region = 1;
+ break;
+ }
+ if (page->cp_state == CPS_FREEING)
+ continue;
+
+ slice = cl_page_at_trusted(page, dtype);
+ /*
+ * Pages for an lsm-less file have no underlying sub-page
+ * for osc, in case of ...
+ */
+ PASSERT(env, page, slice != NULL);
+
+ page = slice->cpl_page;
+ /*
+ * Can safely call cl_page_get_trust() under
+ * radix-tree spin-lock.
+ *
+ * XXX not true, because @page is from object another
+ * than @hdr and protected by different tree lock.
+ */
+ cl_page_get_trust(page);
+ lu_ref_add_atomic(&page->cp_reference,
+ "gang_lookup", current);
+ pvec[j++] = page;
+ }
+
+ /*
+ * Here a delicate locking dance is performed. Current thread
+ * holds a reference to a page, but has to own it before it
+ * can be placed into queue. Owning implies waiting, so
+ * radix-tree lock is to be released. After a wait one has to
+ * check that pages weren't truncated (cl_page_own() returns
+ * error in the latter case).
+ */
+ spin_unlock(&hdr->coh_page_guard);
+ tree_lock = 0;
+
+ for (i = 0; i < j; ++i) {
+ page = pvec[i];
+ if (res == CLP_GANG_OKAY)
+ res = (*cb)(env, io, page, cbdata);
+ lu_ref_del(&page->cp_reference,
+ "gang_lookup", current);
+ cl_page_put(env, page);
+ }
+ if (nr < CLT_PVEC_SIZE || end_of_region)
+ break;
+
+ if (res == CLP_GANG_OKAY && need_resched())
+ res = CLP_GANG_RESCHED;
+ if (res != CLP_GANG_OKAY)
+ break;
+
+ spin_lock(&hdr->coh_page_guard);
+ tree_lock = 1;
+ }
+ if (tree_lock)
+ spin_unlock(&hdr->coh_page_guard);
+ return res;
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
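+
+/*
+ * Callers of cl_page_gang_lookup() typically loop until CLP_GANG_OKAY is
+ * returned (or the requested range is exhausted), calling cond_resched()
+ * when CLP_GANG_RESCHED is seen; cl_lock_discard_pages() earlier in this
+ * patch shows this retry pattern.
+ */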
+
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+ struct cl_object *obj = page->cp_obj;
+ int pagesize = cl_object_header(obj)->coh_page_bufsize;
+
+ PASSERT(env, page, list_empty(&page->cp_batch));
+ PASSERT(env, page, page->cp_owner == NULL);
+ PASSERT(env, page, page->cp_req == NULL);
+ PASSERT(env, page, page->cp_parent == NULL);
+ PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+ might_sleep();
+ while (!list_empty(&page->cp_layers)) {
+ struct cl_page_slice *slice;
+
+ slice = list_entry(page->cp_layers.next,
+ struct cl_page_slice, cpl_linkage);
+ list_del_init(page->cp_layers.next);
+ slice->cpl_ops->cpo_fini(env, slice);
+ }
+ CS_PAGE_DEC(obj, total);
+ CS_PAGESTATE_DEC(obj, page->cp_state);
+ lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page);
+ cl_object_put(env, obj);
+ lu_ref_fini(&page->cp_reference);
+ OBD_FREE(page, pagesize);
+}
+
+/**
+ * Helper function updating page state. This is the only place in the code
+ * where cl_page::cp_state field is mutated.
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+ enum cl_page_state state)
+{
+ /* bypass const. */
+ *(enum cl_page_state *)&page->cp_state = state;
+}
+
+static struct cl_page *cl_page_alloc(const struct lu_env *env,
+ struct cl_object *o, pgoff_t ind, struct page *vmpage,
+ enum cl_page_type type)
+{
+ struct cl_page *page;
+ struct lu_object_header *head;
+
+ OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize,
+ GFP_NOFS);
+ if (page != NULL) {
+ int result = 0;
+ atomic_set(&page->cp_ref, 1);
+ if (type == CPT_CACHEABLE) /* for radix tree */
+ atomic_inc(&page->cp_ref);
+ page->cp_obj = o;
+ cl_object_get(o);
+ lu_object_ref_add_at(&o->co_lu, &page->cp_obj_ref, "cl_page",
+ page);
+ page->cp_index = ind;
+ cl_page_state_set_trust(page, CPS_CACHED);
+ page->cp_type = type;
+ INIT_LIST_HEAD(&page->cp_layers);
+ INIT_LIST_HEAD(&page->cp_batch);
+ INIT_LIST_HEAD(&page->cp_flight);
+ mutex_init(&page->cp_mutex);
+ lu_ref_init(&page->cp_reference);
+ head = o->co_lu.lo_header;
+ list_for_each_entry(o, &head->loh_layers,
+ co_lu.lo_linkage) {
+ if (o->co_ops->coo_page_init != NULL) {
+ result = o->co_ops->coo_page_init(env, o,
+ page, vmpage);
+ if (result != 0) {
+ cl_page_delete0(env, page, 0);
+ cl_page_free(env, page);
+ page = ERR_PTR(result);
+ break;
+ }
+ }
+ }
+ if (result == 0) {
+ CS_PAGE_INC(o, total);
+ CS_PAGE_INC(o, create);
+ CS_PAGESTATE_DEC(o, CPS_CACHED);
+ }
+ } else {
+ page = ERR_PTR(-ENOMEM);
+ }
+ return page;
+}
+
+/**
+ * Returns a cl_page with index \a idx at the object \a o, and associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. First, a
+ * cache (implemented as a per-object radix tree) is consulted. If page is
+ * found there, it is returned immediately. Otherwise new page is allocated
+ * and returned. In any case, additional reference to page is acquired.
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+static struct cl_page *cl_page_find0(const struct lu_env *env,
+ struct cl_object *o,
+ pgoff_t idx, struct page *vmpage,
+ enum cl_page_type type,
+ struct cl_page *parent)
+{
+ struct cl_page *page = NULL;
+ struct cl_page *ghost = NULL;
+ struct cl_object_header *hdr;
+ int err;
+
+ LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+ might_sleep();
+
+ hdr = cl_object_header(o);
+ CS_PAGE_INC(o, lookup);
+
+ CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
+ idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+ /* fast path. */
+ if (type == CPT_CACHEABLE) {
+ /* vmpage lock is used to protect the child/parent
+ * relationship */
+ KLASSERT(PageLocked(vmpage));
+ /*
+ * cl_vmpage_page() can be called here without any locks as
+ *
+ * - "vmpage" is locked (which prevents ->private from
+ * concurrent updates), and
+ *
+ * - "o" cannot be destroyed while current thread holds a
+ * reference on it.
+ */
+ page = cl_vmpage_page(vmpage, o);
+ PINVRNT(env, page,
+ ergo(page != NULL,
+ cl_page_vmpage(env, page) == vmpage &&
+ (void *)radix_tree_lookup(&hdr->coh_tree,
+ idx) == page));
+ }
+
+ if (page != NULL) {
+ CS_PAGE_INC(o, hit);
+ return page;
+ }
+
+ /* allocate and initialize cl_page */
+ page = cl_page_alloc(env, o, idx, vmpage, type);
+ if (IS_ERR(page))
+ return page;
+
+ if (type == CPT_TRANSIENT) {
+ if (parent) {
+ LASSERT(page->cp_parent == NULL);
+ page->cp_parent = parent;
+ parent->cp_child = page;
+ }
+ return page;
+ }
+
+ /*
+ * XXX optimization: use radix_tree_preload() here, and change tree
+ * gfp mask to GFP_KERNEL in cl_object_header_init().
+ */
+ spin_lock(&hdr->coh_page_guard);
+ err = radix_tree_insert(&hdr->coh_tree, idx, page);
+ if (err != 0) {
+ ghost = page;
+ /*
+ * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+ * from this race, but
+ *
+ * 0. it's better to have cl_page interface "locally
+ * consistent" so that its correctness can be reasoned
+ * about without appealing to the (obscure world of) VM
+ * locking.
+ *
+ * 1. handling this race allows ->coh_tree to remain
+ * consistent even when VM locking is somehow busted,
+ * which is very useful during diagnosing and debugging.
+ */
+ page = ERR_PTR(err);
+ CL_PAGE_DEBUG(D_ERROR, env, ghost,
+ "fail to insert into radix tree: %d\n", err);
+ } else {
+ if (parent) {
+ LASSERT(page->cp_parent == NULL);
+ page->cp_parent = parent;
+ parent->cp_child = page;
+ }
+ hdr->coh_pages++;
+ }
+ spin_unlock(&hdr->coh_page_guard);
+
+ if (unlikely(ghost != NULL)) {
+ cl_page_delete0(env, ghost, 0);
+ cl_page_free(env, ghost);
+ }
+ return page;
+}
+
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+ pgoff_t idx, struct page *vmpage,
+ enum cl_page_type type)
+{
+ return cl_page_find0(env, o, idx, vmpage, type, NULL);
+}
+EXPORT_SYMBOL(cl_page_find);
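+
+/*
+ * Usage sketch for cl_page_find() (illustrative only): the reference
+ * acquired here is dropped with cl_page_put().
+ *
+ *	page = cl_page_find(env, obj, idx, vmpage, CPT_CACHEABLE);
+ *	if (!IS_ERR(page)) {
+ *		... use the page ...
+ *		cl_page_put(env, page);
+ *	}
+ */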
+
+
+struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o,
+ pgoff_t idx, struct page *vmpage,
+ struct cl_page *parent)
+{
+ return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent);
+}
+EXPORT_SYMBOL(cl_page_find_sub);
+
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+ struct cl_object_header *header;
+ struct cl_page *parent;
+ struct cl_page *child;
+ struct cl_io *owner;
+
+ /*
+ * Page invariant is protected by a VM lock.
+ */
+ LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+ header = cl_object_header(pg->cp_obj);
+ parent = pg->cp_parent;
+ child = pg->cp_child;
+ owner = pg->cp_owner;
+
+ return cl_page_in_use(pg) &&
+ ergo(parent != NULL, parent->cp_child == pg) &&
+ ergo(child != NULL, child->cp_parent == pg) &&
+ ergo(child != NULL, pg->cp_obj != child->cp_obj) &&
+ ergo(parent != NULL, pg->cp_obj != parent->cp_obj) &&
+ ergo(owner != NULL && parent != NULL,
+ parent->cp_owner == pg->cp_owner->ci_parent) &&
+ ergo(owner != NULL && child != NULL,
+ child->cp_owner->ci_parent == owner) &&
+ /*
+ * Either page is early in initialization (has neither child
+ * nor parent yet), or it is in the object radix tree.
+ */
+ ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
+ (void *)radix_tree_lookup(&header->coh_tree,
+ pg->cp_index) == pg ||
+ (child == NULL && parent == NULL));
+}
+
+static void cl_page_state_set0(const struct lu_env *env,
+ struct cl_page *page, enum cl_page_state state)
+{
+ enum cl_page_state old;
+
+ /*
+ * Matrix of allowed state transitions [old][new], for sanity
+ * checking.
+ */
+ static const int allowed_transitions[CPS_NR][CPS_NR] = {
+ [CPS_CACHED] = {
+ [CPS_CACHED] = 0,
+ [CPS_OWNED] = 1, /* io finds existing cached page */
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 1, /* write-out from the cache */
+ [CPS_FREEING] = 1, /* eviction on the memory pressure */
+ },
+ [CPS_OWNED] = {
+ [CPS_CACHED] = 1, /* release to the cache */
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 1, /* start read immediately */
+ [CPS_PAGEOUT] = 1, /* start write immediately */
+ [CPS_FREEING] = 1, /* lock invalidation or truncate */
+ },
+ [CPS_PAGEIN] = {
+ [CPS_CACHED] = 1, /* io completion */
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 0,
+ [CPS_FREEING] = 0,
+ },
+ [CPS_PAGEOUT] = {
+ [CPS_CACHED] = 1, /* io completion */
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 0,
+ [CPS_FREEING] = 0,
+ },
+ [CPS_FREEING] = {
+ [CPS_CACHED] = 0,
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 0,
+ [CPS_FREEING] = 0,
+ }
+ };
+
+ old = page->cp_state;
+ PASSERT(env, page, allowed_transitions[old][state]);
+ CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
+ for (; page != NULL; page = page->cp_child) {
+ PASSERT(env, page, page->cp_state == old);
+ PASSERT(env, page,
+ equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+ CS_PAGESTATE_DEC(page->cp_obj, page->cp_state);
+ CS_PAGESTATE_INC(page->cp_obj, state);
+ cl_page_state_set_trust(page, state);
+ }
+}
+
+static void cl_page_state_set(const struct lu_env *env,
+ struct cl_page *page, enum cl_page_state state)
+{
+ cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * page.
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+ cl_page_get_trust(page);
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Releases a reference to a page.
+ *
+ * When last reference is released, page is returned to the cache, unless it
+ * is in cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+ PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+ CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
+ atomic_read(&page->cp_ref));
+
+ if (atomic_dec_and_test(&page->cp_ref)) {
+ LASSERT(page->cp_state == CPS_FREEING);
+
+ LASSERT(atomic_read(&page->cp_ref) == 0);
+ PASSERT(env, page, page->cp_owner == NULL);
+ PASSERT(env, page, list_empty(&page->cp_batch));
+ /*
+ * Page is no longer reachable by other threads. Tear
+ * it down.
+ */
+ cl_page_free(env, page);
+ }
+}
+EXPORT_SYMBOL(cl_page_put);
+
+/**
+ * Returns a VM page associated with a given cl_page.
+ */
+struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+ const struct cl_page_slice *slice;
+
+ /*
+ * Find uppermost layer with ->cpo_vmpage() method, and return its
+ * result.
+ */
+ page = cl_page_top(page);
+ do {
+ list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+ if (slice->cpl_ops->cpo_vmpage != NULL)
+ return slice->cpl_ops->cpo_vmpage(env, slice);
+ }
+ page = page->cp_child;
+ } while (page != NULL);
+ LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Returns a cl_page associated with a VM page, and given cl_object.
+ */
+struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
+{
+ struct cl_page *top;
+ struct cl_page *page;
+
+ KLASSERT(PageLocked(vmpage));
+
+ /*
+ * NOTE: absence of races and liveness of data are guaranteed by page
+ * lock on a "vmpage". That works because object destruction has
+ * bottom-to-top pass.
+ */
+
+ /*
+ * This loop assumes that ->private points to the top-most page. This
+ * can be rectified easily.
+ */
+ top = (struct cl_page *)vmpage->private;
+ if (top == NULL)
+ return NULL;
+
+ for (page = top; page != NULL; page = page->cp_child) {
+ if (cl_object_same(page->cp_obj, obj)) {
+ cl_page_get_trust(page);
+ break;
+ }
+ }
+ LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
+ return page;
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-page for a given page.
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+ return cl_page_top_trusted(page);
+}
+EXPORT_SYMBOL(cl_page_top);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+ const struct lu_device_type *dtype)
+{
+ return cl_page_at_trusted(page, dtype);
+}
+EXPORT_SYMBOL(cl_page_at);
+
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname)
+
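+/*
+ * CL_PAGE_INVOKE() walks the page stack from the top page down through
+ * cp_child, invoking the slice method found at offset _op in
+ * cl_page_operations on every layer, and stops at the first non-zero result
+ * (positive results are folded to 0). CL_PAGE_INVOID() below is the
+ * void-returning variant, and CL_PAGE_INVOID_REVERSE() walks the stack
+ * bottom-up instead.
+ */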
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \
+({ \
+ const struct lu_env *__env = (_env); \
+ struct cl_page *__page = (_page); \
+ const struct cl_page_slice *__scan; \
+ int __result; \
+ ptrdiff_t __op = (_op); \
+ int (*__method)_proto; \
+ \
+ __result = 0; \
+ __page = cl_page_top(__page); \
+ do { \
+ list_for_each_entry(__scan, &__page->cp_layers, \
+ cpl_linkage) { \
+ __method = *(void **)((char *)__scan->cpl_ops + \
+ __op); \
+ if (__method != NULL) { \
+ __result = (*__method)(__env, __scan, \
+ ## __VA_ARGS__); \
+ if (__result != 0) \
+ break; \
+ } \
+ } \
+ __page = __page->cp_child; \
+ } while (__page != NULL && __result == 0); \
+ if (__result > 0) \
+ __result = 0; \
+ __result; \
+})
+
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \
+do { \
+ const struct lu_env *__env = (_env); \
+ struct cl_page *__page = (_page); \
+ const struct cl_page_slice *__scan; \
+ ptrdiff_t __op = (_op); \
+ void (*__method)_proto; \
+ \
+ __page = cl_page_top(__page); \
+ do { \
+ list_for_each_entry(__scan, &__page->cp_layers, \
+ cpl_linkage) { \
+ __method = *(void **)((char *)__scan->cpl_ops + \
+ __op); \
+ if (__method != NULL) \
+ (*__method)(__env, __scan, \
+ ## __VA_ARGS__); \
+ } \
+ __page = __page->cp_child; \
+ } while (__page != NULL); \
+} while (0)
+
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \
+do { \
+ const struct lu_env *__env = (_env); \
+ struct cl_page *__page = (_page); \
+ const struct cl_page_slice *__scan; \
+ ptrdiff_t __op = (_op); \
+ void (*__method)_proto; \
+ \
+ /* get to the bottom page. */ \
+ while (__page->cp_child != NULL) \
+ __page = __page->cp_child; \
+ do { \
+ list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+ cpl_linkage) { \
+ __method = *(void **)((char *)__scan->cpl_ops + \
+ __op); \
+ if (__method != NULL) \
+ (*__method)(__env, __scan, \
+ ## __VA_ARGS__); \
+ } \
+ __page = __page->cp_parent; \
+ } while (__page != NULL); \
+} while (0)
+
+static int cl_page_invoke(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+ PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+ return CL_PAGE_INVOKE(env, page, op,
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+}
+
+static void cl_page_invoid(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+ PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+ CL_PAGE_INVOID(env, page, op,
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *), io);
+}
+
+static void cl_page_owner_clear(struct cl_page *page)
+{
+ for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+ if (page->cp_owner != NULL) {
+ LASSERT(page->cp_owner->ci_owned_nr > 0);
+ page->cp_owner->ci_owned_nr--;
+ page->cp_owner = NULL;
+ page->cp_task = NULL;
+ }
+ }
+}
+
+static void cl_page_owner_set(struct cl_page *page)
+{
+ for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+ LASSERT(page->cp_owner != NULL);
+ page->cp_owner->ci_owned_nr++;
+ }
+}
+
+void cl_page_disown0(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ enum cl_page_state state;
+
+ state = pg->cp_state;
+ PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ cl_page_owner_clear(pg);
+
+ if (state == CPS_OWNED)
+ cl_page_state_set(env, pg, CPS_CACHED);
+ /*
+ * Completion call-backs are executed in bottom-up order, so that the
+ * uppermost layer (llite), responsible for VFS/VM interaction, runs
+ * last and can release locks safely.
+ */
+ CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+}
+
+/**
+ * Returns true iff the page is owned by the given io.
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+ LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+ return pg->cp_state == CPS_OWNED && pg->cp_owner == io;
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Try to own a page by IO.
+ *
+ * Waits until the page is in cl_page_state::CPS_CACHED state, and then
+ * switches it into cl_page_state::CPS_OWNED state.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0 success
+ *
+ * \retval -ve failure, e.g., the page was destroyed (and landed in
+ * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED),
+ * or the page was owned by another thread, or is in IO.
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own
+ */
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, int nonblock)
+{
+ int result;
+
+ PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+
+ if (pg->cp_state == CPS_FREEING) {
+ result = -ENOENT;
+ } else {
+ result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+ (const struct lu_env *,
+ const struct cl_page_slice *,
+ struct cl_io *, int),
+ io, nonblock);
+ if (result == 0) {
+ PASSERT(env, pg, pg->cp_owner == NULL);
+ PASSERT(env, pg, pg->cp_req == NULL);
+ pg->cp_owner = io;
+ pg->cp_task = current;
+ cl_page_owner_set(pg);
+ if (pg->cp_state != CPS_FREEING) {
+ cl_page_state_set(env, pg, CPS_OWNED);
+ } else {
+ cl_page_disown0(env, io, pg);
+ result = -ENOENT;
+ }
+ }
+ }
+ PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+ return result;
+}
+
+/**
+ * Own a page; may block.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+ return cl_page_own0(env, io, pg, 0);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Nonblock version of cl_page_own().
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg)
+{
+ return cl_page_own0(env, io, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_own_try);
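+
+/*
+ * Typical ownership cycle (a sketch only, mirroring page_prune_cb() further
+ * down in this file):
+ *
+ *	if (cl_page_own(env, io, pg) == 0) {
+ *		... operate on the page while it is in CPS_OWNED ...
+ *		cl_page_disown(env, io, pg);
+ *	}
+ *
+ * cl_page_own_try() follows the same pattern but returns a negative value
+ * instead of blocking when the page cannot be owned immediately.
+ */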
+
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+
+ cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+ PASSERT(env, pg, pg->cp_owner == NULL);
+ pg->cp_owner = io;
+ pg->cp_task = current;
+ cl_page_owner_set(pg);
+ cl_page_state_set(env, pg, CPS_OWNED);
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+ cl_page_owner_clear(pg);
+ cl_page_state_set(env, pg, CPS_CACHED);
+ CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+ cl_page_disown0(env, io, pg);
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for pages that are not
+ * fully constructed, e.g., in an error handling
+ * cl_page_find()->cl_page_delete0() path. Doesn't check the page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+ int radix)
+{
+ struct cl_page *tmp = pg;
+
+ PASSERT(env, pg, pg == cl_page_top(pg));
+ PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+ /*
+	 * Sever all ways to obtain new pointers to @pg.
+ */
+ cl_page_owner_clear(pg);
+
+ /*
+	 * Unexport the page before freeing it, so that its content is
+	 * considered invalid.  We have to do this because a CPS_FREEING
+	 * cl_page may NOT be under the protection of a cl_lock.  If the page
+	 * is later found by other threads, it will be forced to be re-read.
+ */
+ cl_page_export(env, pg, 0);
+ cl_page_state_set0(env, pg, CPS_FREEING);
+
+ CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+ (const struct lu_env *, const struct cl_page_slice *));
+
+ if (tmp->cp_type == CPT_CACHEABLE) {
+ if (!radix)
+ /* !radix means that @pg is not yet in the radix tree,
+ * skip removing it.
+ */
+ tmp = pg->cp_child;
+ for (; tmp != NULL; tmp = tmp->cp_child) {
+ void *value;
+ struct cl_object_header *hdr;
+
+ hdr = cl_object_header(tmp->cp_obj);
+ spin_lock(&hdr->coh_page_guard);
+ value = radix_tree_delete(&hdr->coh_tree,
+ tmp->cp_index);
+ PASSERT(env, tmp, value == tmp);
+ PASSERT(env, tmp, hdr->coh_pages > 0);
+ hdr->coh_pages--;
+ spin_unlock(&hdr->coh_page_guard);
+ cl_page_put(env, tmp);
+ }
+ }
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all avenues through which new references to the page can be
+ * obtained:
+ *
+ * - removes page from the radix trees,
+ *
+ * - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ cl_page_delete0(env, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Calls cl_page_operations::cpo_export() through all layers top-to-bottom.
+ * The layer responsible for VM interaction has to mark/clear the page as
+ * up-to-date according to the \a uptodate argument.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate)
+{
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+ (const struct lu_env *,
+ const struct cl_page_slice *, int), uptodate);
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true iff \a pg is VM-locked in a suitable sense by the calling
+ * thread.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+ int result;
+ const struct cl_page_slice *slice;
+
+ pg = cl_page_top_trusted((struct cl_page *)pg);
+ slice = container_of(pg->cp_layers.next,
+ const struct cl_page_slice, cpl_linkage);
+ PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+ /*
+ * Call ->cpo_is_vmlocked() directly instead of going through
+ * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+ * cl_page_invariant().
+ */
+ result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+ PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+ return result == -EBUSY;
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
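+
+/*
+ * Note: ->cpo_is_vmlocked() uses an errno-style convention, -EBUSY meaning
+ * "the VM page is locked" and -ENODATA meaning "it is not", which the
+ * assertion above enforces; cl_page_is_vmlocked() folds this back into a
+ * boolean so callers can simply write
+ *
+ *	LASSERT(cl_page_is_vmlocked(env, pg));
+ */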
+
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+ return crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN;
+}
+
+static void cl_page_io_start(const struct lu_env *env,
+ struct cl_page *pg, enum cl_req_type crt)
+{
+ /*
+ * Page is queued for IO, change its state.
+ */
+ cl_page_owner_clear(pg);
+ cl_page_state_set(env, pg, cl_req_type_state(crt));
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). The
+ * layer handling interactions with the VM also has to inform the VM that the
+ * page is now under transfer.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt)
+{
+ int result;
+
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ PINVRNT(env, pg, crt < CRT_NR);
+
+ /*
+ * XXX this has to be called bottom-to-top, so that llite can set up
+ * PG_writeback without risking other layers deciding to skip this
+ * page.
+ */
+ if (crt >= CRT_NR)
+ return -EINVAL;
+ result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+ if (result == 0)
+ cl_page_io_start(env, pg, crt);
+
+ KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+ equi(result == 0,
+ PageWriteback(cl_page_vmpage(env, pg)))));
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+ return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
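+
+/*
+ * Note for callers (an interpretation of the contract above, not extra
+ * semantics): -EALREADY propagated from a layer's ->cpo_prep() is not a hard
+ * failure; it means the page should simply be omitted from the transfer, so
+ * callers are expected to treat it differently from other errors.
+ */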
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by the transfer sub-system (which is a part of osc) to notify
+ * layers that a transfer of which this page is a part has completed.
+ *
+ * Completion call-backs are executed in bottom-up order, so that the
+ * uppermost layer (llite), responsible for the VFS/VM interaction, runs last
+ * and can release locks safely.
+ *
+ * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+ struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+ struct cl_sync_io *anchor = pg->cp_sync_io;
+
+ PASSERT(env, pg, crt < CRT_NR);
+ /* cl_page::cp_req already cleared by the caller (osc_completion()) */
+ PASSERT(env, pg, pg->cp_req == NULL);
+ PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+ if (crt == CRT_READ && ioret == 0) {
+ PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+ pg->cp_flags |= CPF_READ_COMPLETED;
+ }
+
+ cl_page_state_set(env, pg, CPS_CACHED);
+ if (crt >= CRT_NR)
+ return;
+ CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+ (const struct lu_env *,
+ const struct cl_page_slice *, int), ioret);
+ if (anchor) {
+ LASSERT(cl_page_is_vmlocked(env, pg));
+ LASSERT(pg->cp_sync_io == anchor);
+ pg->cp_sync_io = NULL;
+ }
+ /*
+ * As page->cp_obj is pinned by a reference from page->cp_req, it is
+ * safe to call cl_page_put() without risking object destruction in a
+ * non-blocking context.
+ */
+ cl_page_put(env, pg);
+
+ if (anchor)
+ cl_sync_io_note(anchor, ioret);
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that the transfer formation engine decided to yank this page
+ * from the cache and to make it a part of a transfer.
+ *
+ * \pre pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+ enum cl_req_type crt)
+{
+ int result;
+
+ PINVRNT(env, pg, crt < CRT_NR);
+
+ if (crt >= CRT_NR)
+ return -EINVAL;
+ result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+ (const struct lu_env *,
+ const struct cl_page_slice *));
+ if (result == 0) {
+ PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+ cl_page_io_start(env, pg, crt);
+ }
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+ return result;
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that a high-level IO decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing transfer engine (osc) has to register this page in
+ * its queues.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt)
+{
+ const struct cl_page_slice *scan;
+ int result = 0;
+
+ PINVRNT(env, pg, crt < CRT_NR);
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ if (crt >= CRT_NR)
+ return -EINVAL;
+
+ list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) {
+ if (scan->cpl_ops->io[crt].cpo_cache_add == NULL)
+ continue;
+
+ result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io);
+ if (result != 0)
+ break;
+ }
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+ return result;
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Called when a page is being written back at the kernel's initiative.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg)
+{
+ int result;
+
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+ return result;
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Checks whether the page is protected by an extent lock of at least the
+ * required mode.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page)
+{
+ int rc;
+
+ PINVRNT(env, page, cl_page_invariant(page));
+
+ rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+ PASSERT(env, page, rc != 0);
+ return rc;
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ cl_page_own(env, io, page);
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ return CLP_GANG_OKAY;
+}
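+
+/*
+ * page_prune_cb() above is the canonical own -> unmap -> discard -> disown
+ * sequence for evicting a single page; cl_pages_prune() below applies it to
+ * every cached page of an object via cl_page_gang_lookup().
+ */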
+
+/**
+ * Purges all cached pages belonging to the object \a clobj.
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+ struct cl_thread_info *info;
+ struct cl_object *obj = cl_object_top(clobj);
+ struct cl_io *io;
+ int result;
+
+ info = cl_env_info(env);
+ io = &info->clt_io;
+
+ /*
+	 * Initialize the io. This is ugly since we never do IO in this
+	 * function; we just make cl_page_list functions happy. -jay
+ */
+ io->ci_obj = obj;
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, obj);
+ if (result != 0) {
+ cl_io_fini(env, io);
+ return io->ci_result;
+ }
+
+ do {
+ result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+ page_prune_cb, NULL);
+ if (result == CLP_GANG_RESCHED)
+ cond_resched();
+ } while (result != CLP_GANG_OKAY);
+
+ cl_io_fini(env, io);
+ return result;
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+ int from, int to)
+{
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+ CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+ (const struct lu_env *,
+		       const struct cl_page_slice *, int, int),
+ from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a human-readable representation of \a pg via \a printer.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_page *pg)
+{
+ (*printer)(env, cookie,
+ "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+ pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+ pg->cp_index, pg->cp_parent, pg->cp_child,
+ pg->cp_state, pg->cp_error, pg->cp_type,
+ pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints a human-readable representation of \a pg via \a printer.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_page *pg)
+{
+ struct cl_page *scan;
+
+ for (scan = cl_page_top((struct cl_page *)pg);
+ scan != NULL; scan = scan->cp_child)
+ cl_page_header_print(env, cookie, printer, scan);
+ CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+ (const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t p), cookie, printer);
+ (*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+ return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+ (const struct lu_env *,
+ const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index within object \a obj into a byte offset.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+ /*
+ * XXX for now.
+ */
+ return (loff_t)idx << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+ /*
+ * XXX for now.
+ */
+ return offset >> PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+int cl_page_size(const struct cl_object *obj)
+{
+ return 1 << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
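+
+/*
+ * Worked example for the three helpers above (assuming 4 KiB pages, i.e.
+ * PAGE_CACHE_SHIFT == 12): cl_page_size() is 4096, cl_offset(obj, 3) is
+ * 0x3000, and cl_index(obj, 0x3000 + 42) is 3; cl_index() rounds any
+ * intra-page offset down to the containing page.
+ */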
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+ struct cl_object *obj,
+ const struct cl_page_operations *ops)
+{
+ list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+ slice->cpl_obj = obj;
+ slice->cpl_ops = ops;
+ slice->cpl_page = page;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+int cl_page_init(void)
+{
+ return 0;
+}
+
+void cl_page_fini(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
new file mode 100644
index 000000000..d4b74b670
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
@@ -0,0 +1,704 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+# include <linux/atomic.h>
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../../include/linux/lnet/lnetctl.h"
+#include "../include/lustre_debug.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre/lustre_build_version.h"
+#include <linux/list.h>
+#include "../include/cl_object.h"
+#include "llog_internal.h"
+
+
+struct obd_device *obd_devs[MAX_OBD_DEVICES];
+EXPORT_SYMBOL(obd_devs);
+struct list_head obd_types;
+DEFINE_RWLOCK(obd_dev_lock);
+
+__u64 obd_max_pages = 0;
+EXPORT_SYMBOL(obd_max_pages);
+__u64 obd_max_alloc = 0;
+EXPORT_SYMBOL(obd_max_alloc);
+__u64 obd_alloc;
+EXPORT_SYMBOL(obd_alloc);
+__u64 obd_pages;
+EXPORT_SYMBOL(obd_pages);
+static DEFINE_SPINLOCK(obd_updatemax_lock);
+
+/* The following are visible and mutable through /proc/sys/lustre/. */
+unsigned int obd_alloc_fail_rate = 0;
+EXPORT_SYMBOL(obd_alloc_fail_rate);
+unsigned int obd_debug_peer_on_timeout;
+EXPORT_SYMBOL(obd_debug_peer_on_timeout);
+unsigned int obd_dump_on_timeout;
+EXPORT_SYMBOL(obd_dump_on_timeout);
+unsigned int obd_dump_on_eviction;
+EXPORT_SYMBOL(obd_dump_on_eviction);
+unsigned int obd_max_dirty_pages = 256;
+EXPORT_SYMBOL(obd_max_dirty_pages);
+atomic_t obd_dirty_pages;
+EXPORT_SYMBOL(obd_dirty_pages);
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(obd_timeout);
+unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(ldlm_timeout);
+unsigned int obd_timeout_set;
+EXPORT_SYMBOL(obd_timeout_set);
+unsigned int ldlm_timeout_set;
+EXPORT_SYMBOL(ldlm_timeout_set);
+/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
+unsigned int at_min = 0;
+EXPORT_SYMBOL(at_min);
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+unsigned int at_history = 600;
+EXPORT_SYMBOL(at_history);
+int at_early_margin = 5;
+EXPORT_SYMBOL(at_early_margin);
+int at_extra = 30;
+EXPORT_SYMBOL(at_extra);
+
+atomic_t obd_dirty_transit_pages;
+EXPORT_SYMBOL(obd_dirty_transit_pages);
+
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
+EXPORT_SYMBOL(obd_jobid_var);
+
+char obd_jobid_node[JOBSTATS_JOBID_SIZE + 1];
+
+/* Get jobid of the current process from the stored variable or construct
+ * it from the process name and user ID.
+ *
+ * Historically this was also done by reading the environment variable
+ * stored in between the "env_start" & "env_end" of task struct.
+ * This is now deprecated.
+ */
+int lustre_get_jobid(char *jobid)
+{
+ memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+ /* Jobstats isn't enabled */
+ if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
+ return 0;
+
+ /* Use process name + fsuid as jobid */
+ if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
+ snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
+ current_comm(),
+ from_kuid(&init_user_ns, current_fsuid()));
+ return 0;
+ }
+
+ /* Whole node dedicated to single job */
+ if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) {
+ strcpy(jobid, obd_jobid_node);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(lustre_get_jobid);
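+
+/*
+ * Illustrative example of the JOBSTATS_PROCNAME_UID format built above:
+ * a "dd" process running with fsuid 500 produces the jobid string "dd.500".
+ */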
+
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+ size_t size, const char *file, int line)
+{
+ if (ptr == NULL ||
+ (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
+ CERROR("%s%salloc of %s (%llu bytes) failed at %s:%d\n",
+		       ptr ? "force " : "", type, name, (__u64)size, file,
+ line);
+ CERROR("%llu total bytes and %llu total pages (%llu bytes) allocated by Lustre, %d total bytes by LNET\n",
+ obd_memory_sum(),
+ obd_pages_sum() << PAGE_CACHE_SHIFT,
+ obd_pages_sum(),
+ atomic_read(&libcfs_kmemory));
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(obd_alloc_fail);
+
+static inline void obd_data2conn(struct lustre_handle *conn,
+ struct obd_ioctl_data *data)
+{
+ memset(conn, 0, sizeof(*conn));
+ conn->cookie = data->ioc_cookie;
+}
+
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+ struct lustre_handle *conn)
+{
+ data->ioc_cookie = conn->cookie;
+}
+
+int class_resolve_dev_name(__u32 len, const char *name)
+{
+ int rc;
+ int dev;
+
+ if (!len || !name) {
+		CERROR("No name passed!\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ if (name[len - 1] != 0) {
+		CERROR("Name not NUL terminated!\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ CDEBUG(D_IOCTL, "device name %s\n", name);
+ dev = class_name2dev(name);
+ if (dev == -1) {
+ CDEBUG(D_IOCTL, "No device for name %s!\n", name);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev);
+ rc = dev;
+
+out:
+ return rc;
+}
+
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+ char *buf = NULL;
+ struct obd_ioctl_data *data;
+ struct libcfs_debug_ioctl_data *debug_data;
+ struct obd_device *obd = NULL;
+ int err = 0, len = 0;
+
+ /* only for debugging */
+ if (cmd == LIBCFS_IOC_DEBUG_MASK) {
+ debug_data = (struct libcfs_debug_ioctl_data *)arg;
+ libcfs_subsystem_debug = debug_data->subs;
+ libcfs_debug = debug_data->debug;
+ return 0;
+ }
+
+ CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+ if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
+ CERROR("OBD ioctl: data error\n");
+ return -EINVAL;
+ }
+ data = (struct obd_ioctl_data *)buf;
+
+ switch (cmd) {
+ case OBD_IOC_PROCESS_CFG: {
+ struct lustre_cfg *lcfg;
+
+ if (!data->ioc_plen1 || !data->ioc_pbuf1) {
+ CERROR("No config buffer passed!\n");
+ err = -EINVAL;
+ goto out;
+ }
+ OBD_ALLOC(lcfg, data->ioc_plen1);
+ if (lcfg == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = copy_from_user(lcfg, data->ioc_pbuf1,
+ data->ioc_plen1);
+ if (!err)
+ err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+ if (!err)
+ err = class_process_config(lcfg);
+
+ OBD_FREE(lcfg, data->ioc_plen1);
+ goto out;
+ }
+
+ case OBD_GET_VERSION:
+ if (!data->ioc_inlbuf1) {
+ CERROR("No buffer passed in ioctl\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
+ CERROR("ioctl buffer too small to hold version\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(data->ioc_bulk, BUILD_VERSION,
+ strlen(BUILD_VERSION) + 1);
+
+ err = obd_ioctl_popdata((void *)arg, data, len);
+ if (err)
+ err = -EFAULT;
+ goto out;
+
+ case OBD_IOC_NAME2DEV: {
+ /* Resolve a device name. This does not change the
+ * currently selected device.
+ */
+ int dev;
+
+ dev = class_resolve_dev_name(data->ioc_inllen1,
+ data->ioc_inlbuf1);
+ data->ioc_dev = dev;
+ if (dev < 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+ if (err)
+ err = -EFAULT;
+ goto out;
+ }
+
+ case OBD_IOC_UUID2DEV: {
+ /* Resolve a device uuid. This does not change the
+ * currently selected device.
+ */
+ int dev;
+ struct obd_uuid uuid;
+
+ if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
+ CERROR("No UUID passed!\n");
+ err = -EINVAL;
+ goto out;
+ }
+ if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+ CERROR("UUID not NUL terminated!\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
+ obd_str2uuid(&uuid, data->ioc_inlbuf1);
+ dev = class_uuid2dev(&uuid);
+ data->ioc_dev = dev;
+ if (dev == -1) {
+ CDEBUG(D_IOCTL, "No device for UUID %s!\n",
+ data->ioc_inlbuf1);
+ err = -EINVAL;
+ goto out;
+ }
+
+ CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
+ dev);
+ err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+ if (err)
+ err = -EFAULT;
+ goto out;
+ }
+
+ case OBD_IOC_CLOSE_UUID: {
+ CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
+ data->ioc_inlbuf1);
+ err = 0;
+ goto out;
+ }
+
+ case OBD_IOC_GETDEVICE: {
+ int index = data->ioc_count;
+ char *status, *str;
+
+ if (!data->ioc_inlbuf1) {
+ CERROR("No buffer passed in ioctl\n");
+ err = -EINVAL;
+ goto out;
+ }
+ if (data->ioc_inllen1 < 128) {
+			CERROR("ioctl buffer too small to hold device info\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ obd = class_num2obd(index);
+ if (!obd) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (obd->obd_stopping)
+ status = "ST";
+ else if (obd->obd_set_up)
+ status = "UP";
+ else if (obd->obd_attached)
+ status = "AT";
+ else
+ status = "--";
+ str = (char *)data->ioc_bulk;
+ snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
+ (int)index, status, obd->obd_type->typ_name,
+ obd->obd_name, obd->obd_uuid.uuid,
+ atomic_read(&obd->obd_refcount));
+ err = obd_ioctl_popdata((void *)arg, data, len);
+
+ err = 0;
+ goto out;
+ }
+
+ }
+
+ if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+ if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) {
+ err = -EINVAL;
+ goto out;
+ }
+ obd = class_name2obd(data->ioc_inlbuf4);
+ } else if (data->ioc_dev < class_devno_max()) {
+ obd = class_num2obd(data->ioc_dev);
+ } else {
+ CERROR("OBD ioctl: No device\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (obd == NULL) {
+		CERROR("OBD ioctl: no device %d\n", data->ioc_dev);
+ err = -EINVAL;
+ goto out;
+ }
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+ if (!obd->obd_set_up || obd->obd_stopping) {
+ CERROR("OBD ioctl: device not setup %d\n", data->ioc_dev);
+ err = -EINVAL;
+ goto out;
+ }
+
+ switch (cmd) {
+ case OBD_IOC_NO_TRANSNO: {
+ if (!obd->obd_attached) {
+ CERROR("Device %d not attached\n", obd->obd_minor);
+ err = -ENODEV;
+ goto out;
+ }
+ CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+ obd->obd_name);
+ obd->obd_no_transno = 1;
+ err = 0;
+ goto out;
+ }
+
+ default: {
+ err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL);
+ if (err)
+ goto out;
+
+ err = obd_ioctl_popdata((void *)arg, data, len);
+ if (err)
+ err = -EFAULT;
+ goto out;
+ }
+ }
+
+ out:
+ if (buf)
+ obd_ioctl_freedata(buf, len);
+ return err;
+} /* class_handle_ioctl */
+
+#define OBD_INIT_CHECK
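+
+/*
+ * The expected string lengths checked below follow from
+ * u64val == 0xffffffffffffffff: "%#llx" prints "0xffffffffffffffff"
+ * (18 characters), "%llu" prints "18446744073709551615" (20 characters)
+ * and "%lld" prints "-1" (2 characters).
+ */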
+int obd_init_checks(void)
+{
+ __u64 u64val, div64val;
+ char buf[64];
+ int len, ret = 0;
+
+ CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", "%llu", "%lld", "%#llx");
+
+ CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF);
+
+ u64val = OBD_OBJECT_EOF;
+ CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val);
+ if (u64val != OBD_OBJECT_EOF) {
+ CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n",
+ u64val, (int)sizeof(u64val));
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), "%#llx", u64val);
+ if (len != 18) {
+ CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+ ret = -EINVAL;
+ }
+
+ div64val = OBD_OBJECT_EOF;
+ CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val);
+ if (u64val != OBD_OBJECT_EOF) {
+ CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n",
+ u64val, (int)sizeof(u64val));
+ ret = -EOVERFLOW;
+ }
+ if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+ CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n",
+ u64val, (int)sizeof(u64val));
+ return -EOVERFLOW;
+ }
+ if (do_div(div64val, 256) != (u64val & 255)) {
+		CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255);
+ return -EOVERFLOW;
+ }
+ if (u64val >> 8 != div64val) {
+ CERROR("do_div(%#llx,256) %llu != %llu\n",
+ u64val, div64val, u64val >> 8);
+ return -EOVERFLOW;
+ }
+ len = snprintf(buf, sizeof(buf), "%#llx", u64val);
+ if (len != 18) {
+ CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), "%llu", u64val);
+ if (len != 20) {
+ CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+ ret = -EINVAL;
+ }
+ len = snprintf(buf, sizeof(buf), "%lld", u64val);
+ if (len != 2) {
+ CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+ ret = -EINVAL;
+ }
+ if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+ CWARN("mask failed: u64val %llu >= %llu\n", u64val,
+ (__u64)PAGE_CACHE_SIZE);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+#if defined (CONFIG_PROC_FS)
+extern int class_procfs_init(void);
+extern int class_procfs_clean(void);
+#else
+static inline int class_procfs_init(void)
+{ return 0; }
+static inline int class_procfs_clean(void)
+{ return 0; }
+#endif
+
+static int __init init_obdclass(void)
+{
+ int i, err;
+ int lustre_register_fs(void);
+
+ for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
+ INIT_LIST_HEAD(&capa_list[i]);
+
+ LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+
+ spin_lock_init(&obd_types_lock);
+ obd_zombie_impexp_init();
+
+ if (IS_ENABLED(CONFIG_PROC_FS)) {
+ obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+ LPROCFS_STATS_FLAG_NONE |
+ LPROCFS_STATS_FLAG_IRQ_SAFE);
+
+ if (obd_memory == NULL) {
+ CERROR("kmalloc of 'obd_memory' failed\n");
+ return -ENOMEM;
+ }
+
+ lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+ LPROCFS_CNTR_AVGMINMAX,
+ "memused", "bytes");
+ lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
+ LPROCFS_CNTR_AVGMINMAX,
+ "pagesused", "pages");
+ }
+
+ err = obd_init_checks();
+ if (err == -EOVERFLOW)
+ return err;
+
+ class_init_uuidlist();
+ err = class_handle_init();
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&obd_types);
+
+ err = misc_register(&obd_psdev);
+ if (err) {
+ CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
+ return err;
+ }
+
+	/* This array is already zeroed for us (static global) */
+ for (i = 0; i < class_devno_max(); i++)
+ obd_devs[i] = NULL;
+
+ /* Default the dirty page cache cap to 1/2 of system memory.
+ * For clients with less memory, a larger fraction is needed
+ * for other purposes (mostly for BGL). */
+ if (totalram_pages <= 512 << (20 - PAGE_CACHE_SHIFT))
+ obd_max_dirty_pages = totalram_pages / 4;
+ else
+ obd_max_dirty_pages = totalram_pages / 2;
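+	/*
+	 * Worked example (assuming 4 KiB pages, PAGE_CACHE_SHIFT == 12): the
+	 * threshold above is 512 MiB of RAM, so a client with 4 GiB of
+	 * memory gets obd_max_dirty_pages = totalram_pages / 2, roughly
+	 * 2 GiB worth of dirty pages.
+	 */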
+
+ err = obd_init_caches();
+ if (err)
+ return err;
+
+ obd_sysctl_init();
+
+ err = class_procfs_init();
+ if (err)
+ return err;
+
+ err = lu_global_init();
+ if (err)
+ return err;
+
+ err = cl_global_init();
+ if (err != 0)
+ return err;
+
+
+ err = llog_info_init();
+ if (err)
+ return err;
+
+ err = lustre_register_fs();
+
+ return err;
+}
+
+void obd_update_maxusage(void)
+{
+ __u64 max1, max2;
+
+ max1 = obd_pages_sum();
+ max2 = obd_memory_sum();
+
+ spin_lock(&obd_updatemax_lock);
+ if (max1 > obd_max_pages)
+ obd_max_pages = max1;
+ if (max2 > obd_max_alloc)
+ obd_max_alloc = max2;
+ spin_unlock(&obd_updatemax_lock);
+}
+EXPORT_SYMBOL(obd_update_maxusage);
+
+#if defined (CONFIG_PROC_FS)
+__u64 obd_memory_max(void)
+{
+ __u64 ret;
+
+ spin_lock(&obd_updatemax_lock);
+ ret = obd_max_alloc;
+ spin_unlock(&obd_updatemax_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(obd_memory_max);
+
+__u64 obd_pages_max(void)
+{
+ __u64 ret;
+
+ spin_lock(&obd_updatemax_lock);
+ ret = obd_max_pages;
+ spin_unlock(&obd_updatemax_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(obd_pages_max);
+#endif
+
+/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this
+ * ifdef to the end of the file to cover module and versioning goo.*/
+static void cleanup_obdclass(void)
+{
+ int i;
+ int lustre_unregister_fs(void);
+ __u64 memory_leaked, pages_leaked;
+ __u64 memory_max, pages_max;
+
+ lustre_unregister_fs();
+
+ misc_deregister(&obd_psdev);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+ if (obd && obd->obd_set_up &&
+ OBT(obd) && OBP(obd, detach)) {
+ /* XXX should this call generic detach otherwise? */
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ OBP(obd, detach)(obd);
+ }
+ }
+ llog_info_fini();
+ cl_global_fini();
+ lu_global_fini();
+
+ obd_cleanup_caches();
+ obd_sysctl_clean();
+
+ class_procfs_clean();
+
+ class_handle_cleanup();
+ class_exit_uuidlist();
+ obd_zombie_impexp_stop();
+
+ memory_leaked = obd_memory_sum();
+ pages_leaked = obd_pages_sum();
+
+ memory_max = obd_memory_max();
+ pages_max = obd_pages_max();
+
+ lprocfs_free_stats(&obd_memory);
+ CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+ "obd_memory max: %llu, leaked: %llu\n",
+ memory_max, memory_leaked);
+ CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
+ "obd_memory_pages max: %llu, leaked: %llu\n",
+ pages_max, pages_leaked);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+
+module_init(init_obdclass);
+module_exit(cleanup_obdclass);
diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c
new file mode 100644
index 000000000..9c934e6d2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/debug.c
@@ -0,0 +1,109 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
+ *
+ * Helper routines for dumping data structs for debugging.
+ */
+
+#define DEBUG_SUBSYSTEM D_OTHER
+
+#include <linux/unaligned/access_ok.h>
+
+#include "../include/obd_support.h"
+#include "../include/lustre_debug.h"
+#include "../include/lustre_net.h"
+
+void dump_lniobuf(struct niobuf_local *nb)
+{
+ CDEBUG(D_RPCTRACE,
+ "niobuf_local: file_offset=%lld, len=%d, page=%p, rc=%d\n",
+ nb->lnb_file_offset, nb->len, nb->page, nb->rc);
+ CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n",
+ nb->page ? page_index(nb->page) : -1);
+}
+EXPORT_SYMBOL(dump_lniobuf);
+
+#define LPDS sizeof(__u64)
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id)
+{
+ LASSERT(addr);
+
+ put_unaligned_le64(off, addr);
+ put_unaligned_le64(id, addr+LPDS);
+ addr += len - LPDS - LPDS;
+ put_unaligned_le64(off, addr);
+ put_unaligned_le64(id, addr+LPDS);
+
+ return 0;
+}
+EXPORT_SYMBOL(block_debug_setup);
+
+int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
+{
+ __u64 ne_off;
+ int err = 0;
+
+ LASSERT(addr);
+
+	ne_off = le64_to_cpu(off);
+	id = le64_to_cpu(id);
+ if (memcmp(addr, (char *)&ne_off, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id %#llx offset %llu off: %#llx != %#llx\n",
+ who, id, off, *(__u64 *)addr, ne_off);
+ err = -EINVAL;
+ }
+ if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n",
+ who, id, off, *(__u64 *)(addr + LPDS), id);
+ err = -EINVAL;
+ }
+
+ addr += end - LPDS - LPDS;
+ if (memcmp(addr, (char *)&ne_off, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id %#llx offset %llu end off: %#llx != %#llx\n",
+ who, id, off, *(__u64 *)addr, ne_off);
+ err = -EINVAL;
+ }
+ if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id %#llx offset %llu end id: %#llx != %#llx\n",
+ who, id, off, *(__u64 *)(addr + LPDS), id);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(block_debug_check);
+#undef LPDS
diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c
new file mode 100644
index 000000000..b1eee0a6d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c
@@ -0,0 +1,1059 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd.h"
+#include "../include/dt_object.h"
+#include <linux/list.h>
+/* fid_be_to_cpu() */
+#include "../include/lustre_fid.h"
+
+#include "../include/lustre_quota.h"
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+struct lu_context_key dt_key = {
+ .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+ .lct_init = dt_global_key_init,
+ .lct_fini = dt_global_key_fini
+};
+EXPORT_SYMBOL(dt_key);
+
+/* no lock is necessary to protect the list, because call-backs
+ * are added during system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+ list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks);
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+ list_del_init(&cb->dtc_linkage);
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+int dt_txn_hook_start(const struct lu_env *env,
+ struct dt_device *dev, struct thandle *th)
+{
+ int rc = 0;
+ struct dt_txn_callback *cb;
+
+ if (th->th_local)
+ return 0;
+
+ list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+ if (cb->dtc_txn_start == NULL ||
+ !(cb->dtc_tag & env->le_ctx.lc_tags))
+ continue;
+ rc = cb->dtc_txn_start(env, th, cb->dtc_cookie);
+ if (rc < 0)
+ break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn)
+{
+ struct dt_device *dev = txn->th_dev;
+ struct dt_txn_callback *cb;
+ int rc = 0;
+
+ if (txn->th_local)
+ return 0;
+
+ list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+ if (cb->dtc_txn_stop == NULL ||
+ !(cb->dtc_tag & env->le_ctx.lc_tags))
+ continue;
+ rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie);
+ if (rc < 0)
+ break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+void dt_txn_hook_commit(struct thandle *txn)
+{
+ struct dt_txn_callback *cb;
+
+ if (txn->th_local)
+ return;
+
+ list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks,
+ dtc_linkage) {
+ if (cb->dtc_txn_commit)
+ cb->dtc_txn_commit(txn, cb->dtc_cookie);
+ }
+}
+EXPORT_SYMBOL(dt_txn_hook_commit);
+
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+
+ INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+ return lu_device_init(&dev->dd_lu_dev, t);
+}
+EXPORT_SYMBOL(dt_device_init);
+
+void dt_device_fini(struct dt_device *dev)
+{
+ lu_device_fini(&dev->dd_lu_dev);
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+int dt_object_init(struct dt_object *obj,
+ struct lu_object_header *h, struct lu_device *d)
+{
+ return lu_object_init(&obj->do_lu, h, d);
+}
+EXPORT_SYMBOL(dt_object_init);
+
+void dt_object_fini(struct dt_object *obj)
+{
+ lu_object_fini(&obj->do_lu);
+}
+EXPORT_SYMBOL(dt_object_fini);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj)
+{
+ if (obj->do_index_ops == NULL)
+ obj->do_ops->do_index_try(env, obj, &dt_directory_features);
+ return obj->do_index_ops != NULL;
+}
+EXPORT_SYMBOL(dt_try_as_dir);
+
+enum dt_format_type dt_mode_to_dft(__u32 mode)
+{
+ enum dt_format_type result;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ result = DFT_DIR;
+ break;
+ case S_IFREG:
+ result = DFT_REGULAR;
+ break;
+ case S_IFLNK:
+ result = DFT_SYM;
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ result = DFT_NODE;
+ break;
+ default:
+ LBUG();
+ break;
+ }
+ return result;
+}
+EXPORT_SYMBOL(dt_mode_to_dft);
+
+/**
+ * Look up the fid of the object named \a name in directory \a dir.
+ */
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+ const char *name, struct lu_fid *fid)
+{
+ if (dt_try_as_dir(env, dir))
+ return dt_lookup(env, dir, (struct dt_rec *)fid,
+ (const struct dt_key *)name, BYPASS_CAPA);
+ return -ENOTDIR;
+}
+EXPORT_SYMBOL(dt_lookup_dir);
+
+/* This differs from dt_locate() in that the top device is passed in as a
+ * parameter instead of being taken from the lu_site. */
+struct dt_object *dt_locate_at(const struct lu_env *env,
+ struct dt_device *dev, const struct lu_fid *fid,
+ struct lu_device *top_dev)
+{
+ struct lu_object *lo, *n;
+
+ lo = lu_object_find_at(env, top_dev, fid, NULL);
+ if (IS_ERR(lo))
+ return (void *)lo;
+
+ LASSERT(lo != NULL);
+
+ list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) {
+ if (n->lo_dev == &dev->dd_lu_dev)
+ return container_of0(n, struct dt_object, do_lu);
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(dt_locate_at);
+
+/**
+ * Find an object named \a entry in the given \a dfh->dfh_o directory.
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
+{
+ struct dt_find_hint *dfh = data;
+ struct dt_device *dt = dfh->dfh_dt;
+ struct lu_fid *fid = dfh->dfh_fid;
+ struct dt_object *obj = dfh->dfh_o;
+ int result;
+
+ result = dt_lookup_dir(env, obj, entry, fid);
+ lu_object_put(env, &obj->do_lu);
+ if (result == 0) {
+ obj = dt_locate(env, dt, fid);
+ if (IS_ERR(obj))
+ result = PTR_ERR(obj);
+ }
+ dfh->dfh_o = obj;
+ return result;
+}
+
+/**
+ * Abstract function which parses a path name and feeds each path
+ * component to \a entry_func.
+ */
+int dt_path_parser(const struct lu_env *env,
+ char *path, dt_entry_func_t entry_func,
+ void *data)
+{
+ char *e;
+ int rc = 0;
+
+ while (1) {
+ e = strsep(&path, "/");
+ if (e == NULL)
+ break;
+
+ if (e[0] == 0) {
+ if (!path || path[0] == '\0')
+ break;
+ continue;
+ }
+ rc = entry_func(env, e, data);
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
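+
+/*
+ * Illustrative behaviour of the parser above: for the path "a//b/c" the
+ * components handed to \a entry_func are "a", "b" and "c"; empty components
+ * produced by leading, doubled or trailing slashes are skipped.
+ */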
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+ const char *path, struct lu_fid *fid)
+{
+ struct dt_thread_info *info = dt_info(env);
+ struct dt_find_hint *dfh = &info->dti_dfh;
+ struct dt_object *obj;
+ char *local = info->dti_buf;
+ int result;
+
+
+ dfh->dfh_dt = dt;
+ dfh->dfh_fid = fid;
+
+ strncpy(local, path, DT_MAX_PATH);
+ local[DT_MAX_PATH - 1] = '\0';
+
+ result = dt->dd_ops->dt_root_get(env, dt, fid);
+ if (result == 0) {
+ obj = dt_locate(env, dt, fid);
+ if (!IS_ERR(obj)) {
+ dfh->dfh_o = obj;
+ result = dt_path_parser(env, local, dt_find_entry, dfh);
+ if (result != 0)
+ obj = ERR_PTR(result);
+ else
+ obj = dfh->dfh_o;
+ }
+ } else {
+ obj = ERR_PTR(result);
+ }
+ return obj;
+}
+EXPORT_SYMBOL(dt_store_resolve);
+
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+ struct dt_device *dt,
+ struct dt_object *p,
+ const char *name,
+ struct lu_fid *fid)
+{
+ struct dt_object *o;
+ int result;
+
+ result = dt_lookup_dir(env, p, name, fid);
+	if (result == 0)
+		o = dt_locate(env, dt, fid);
+	else
+		o = ERR_PTR(result);
+
+ return o;
+}
+
+/**
+ * Open dt object named \a filename from \a dirname directory.
+ * \param dt dt device
+ * \param fid on success, object fid is stored in *fid
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+ struct dt_device *dt,
+ const char *dirname,
+ const char *filename,
+ struct lu_fid *fid)
+{
+ struct dt_object *file;
+ struct dt_object *dir;
+
+ dir = dt_store_resolve(env, dt, dirname, fid);
+ if (!IS_ERR(dir)) {
+ file = dt_reg_open(env, dt, dir,
+ filename, fid);
+ lu_object_put(env, &dir->do_lu);
+ } else {
+ file = dir;
+ }
+ return file;
+}
+EXPORT_SYMBOL(dt_store_open);
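+
+/*
+ * Usage sketch for dt_store_open() (the directory and file names here are
+ * purely illustrative):
+ *
+ *	struct lu_fid fid;
+ *	struct dt_object *o = dt_store_open(env, dt, "", "some_file", &fid);
+ *
+ *	if (!IS_ERR(o)) {
+ *		... read or write the object ...
+ *		lu_object_put(env, &o->do_lu);
+ *	}
+ */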
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object_format *dof,
+ struct lu_attr *at)
+{
+ struct dt_object *dto;
+ struct thandle *th;
+ int rc;
+
+ dto = dt_locate(env, dt, fid);
+ if (IS_ERR(dto))
+ return dto;
+
+ LASSERT(dto != NULL);
+ if (dt_object_exists(dto))
+ return dto;
+
+ th = dt_trans_create(env, dt);
+ if (IS_ERR(th)) {
+ rc = PTR_ERR(th);
+ goto out;
+ }
+
+ rc = dt_declare_create(env, dto, at, NULL, dof, th);
+ if (rc)
+ goto trans_stop;
+
+ rc = dt_trans_start_local(env, dt, th);
+ if (rc)
+ goto trans_stop;
+
+ dt_write_lock(env, dto, 0);
+ if (dt_object_exists(dto)) {
+ rc = 0;
+ goto unlock;
+ }
+
+ CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid));
+
+ rc = dt_create(env, dto, at, NULL, dof, th);
+ if (rc)
+ goto unlock;
+ LASSERT(dt_object_exists(dto));
+unlock:
+ dt_write_unlock(env, dto);
+trans_stop:
+ dt_trans_stop(env, dt, th);
+out:
+ if (rc) {
+ lu_object_put(env, &dto->do_lu);
+ return ERR_PTR(rc);
+ }
+ return dto;
+}
+EXPORT_SYMBOL(dt_find_or_create);
+
+/* dt class init function. */
+int dt_global_init(void)
+{
+ LU_CONTEXT_KEY_INIT(&dt_key);
+ return lu_context_key_register(&dt_key);
+}
+
+void dt_global_fini(void)
+{
+ lu_context_key_degister(&dt_key);
+}
+
+/**
+ * Generic read helper. May return an error for partial reads.
+ *
+ * \param env lustre environment
+ * \param dt object to be read
+ * \param buf lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval real size of data read
+ * \retval -ve errno on failure
+ */
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos)
+{
+ LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+ return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_read);
+
+/**
+ * Read structures of fixed size from storage. Unlike dt_read(), using
+ * dt_record_read() will return an error for partial reads.
+ *
+ * \param env lustre environment
+ * \param dt object to be read
+ * \param buf lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval 0 on successfully reading full buffer
+ * \retval -EFAULT on short read
+ * \retval -ve errno on failure
+ */
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+ struct lu_buf *buf, loff_t *pos)
+{
+ int rc;
+
+ LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+ rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+
+ if (rc == buf->lb_len)
+ rc = 0;
+ else if (rc >= 0)
+ rc = -EFAULT;
+ return rc;
+}
+EXPORT_SYMBOL(dt_record_read);
+
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+ const struct lu_buf *buf, loff_t *pos, struct thandle *th)
+{
+ int rc;
+
+ LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+ LASSERT(th != NULL);
+ LASSERT(dt->do_body_ops);
+ LASSERT(dt->do_body_ops->dbo_write);
+ rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1);
+ if (rc == buf->lb_len)
+ rc = 0;
+ else if (rc >= 0)
+ rc = -EFAULT;
+ return rc;
+}
+EXPORT_SYMBOL(dt_record_write);
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+ struct thandle *th)
+{
+ struct lu_buf vbuf;
+ char *xname = XATTR_NAME_VERSION;
+
+ LASSERT(o);
+ vbuf.lb_buf = NULL;
+ vbuf.lb_len = sizeof(dt_obj_version_t);
+ return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th);
+
+}
+EXPORT_SYMBOL(dt_declare_version_set);
+
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+ dt_obj_version_t version, struct thandle *th)
+{
+ struct lu_buf vbuf;
+ char *xname = XATTR_NAME_VERSION;
+ int rc;
+
+ LASSERT(o);
+ vbuf.lb_buf = &version;
+ vbuf.lb_len = sizeof(version);
+
+ rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA);
+ if (rc < 0)
+ CDEBUG(D_INODE, "Can't set version, rc %d\n", rc);
+ return;
+}
+EXPORT_SYMBOL(dt_version_set);
+
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o)
+{
+ struct lu_buf vbuf;
+ char *xname = XATTR_NAME_VERSION;
+ dt_obj_version_t version;
+ int rc;
+
+ LASSERT(o);
+ vbuf.lb_buf = &version;
+ vbuf.lb_len = sizeof(version);
+ rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA);
+ if (rc != sizeof(version)) {
+ CDEBUG(D_INODE, "Can't get version, rc %d\n", rc);
+ version = 0;
+ }
+ return version;
+}
+EXPORT_SYMBOL(dt_version_get);
+
+/* list of all supported index types */
+
+/* directories */
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
+/* scrub iterator */
+const struct dt_index_features dt_otable_features;
+EXPORT_SYMBOL(dt_otable_features);
+
+/* lfsck */
+const struct dt_index_features dt_lfsck_features = {
+ .dif_flags = DT_IND_UPDATE,
+ .dif_keysize_min = sizeof(struct lu_fid),
+ .dif_keysize_max = sizeof(struct lu_fid),
+ .dif_recsize_min = sizeof(__u8),
+ .dif_recsize_max = sizeof(__u8),
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_lfsck_features);
+
+/* accounting indexes */
+const struct dt_index_features dt_acct_features = {
+ .dif_flags = DT_IND_UPDATE,
+ .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */
+ .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_acct_features);
+
+/* global quota files */
+const struct dt_index_features dt_quota_glb_features = {
+ .dif_flags = DT_IND_UPDATE,
+ /* a different key would have to be used for per-directory quota */
+ .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */
+ .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files */
+const struct dt_index_features dt_quota_slv_features = {
+ .dif_flags = DT_IND_UPDATE,
+ /* a different key would have to be used for per-directory quota */
+ .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */
+ .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/* helper function returning which dt_index_features structure should be used
+ * based on the FID sequence. This is used by the OBD_IDX_READ RPC */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+ __u32 mode)
+{
+ if (seq == FID_SEQ_QUOTA_GLB) {
+ /* global quota index */
+ if (!S_ISREG(mode))
+ /* global quota index should be a regular file */
+ return ERR_PTR(-ENOENT);
+ return &dt_quota_glb_features;
+ } else if (seq == FID_SEQ_QUOTA) {
+ /* quota slave index */
+ if (!S_ISREG(mode))
+ /* slave index should be a regular file */
+ return ERR_PTR(-ENOENT);
+ return &dt_quota_slv_features;
+ } else if (seq >= FID_SEQ_NORMAL) {
+ /* object is part of the namespace, verify that it is a
+ * directory */
+ if (!S_ISDIR(mode))
+ /* sorry, we can only deal with directory */
+ return ERR_PTR(-ENOTDIR);
+ return &dt_directory_features;
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ *
+ * \param env - is the environment passed by the caller
+ * \param lp - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg - is a pointer to the idx_info structure
+ */
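+/*
+ * Layout of each entry appended to lip_entries, with sizes taken from the
+ * idx_info packed by the client:
+ *
+ *	[64-bit hash][key of ii_keysize bytes][record of ii_recsize bytes]
+ *
+ * The leading hash is omitted when II_FL_NOHASH is set.
+ */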
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+ int nob, const struct dt_it_ops *iops,
+ struct dt_it *it, __u32 attr, void *arg)
+{
+ struct idx_info *ii = (struct idx_info *)arg;
+ struct lu_idxpage *lip = &lp->lp_idx;
+ char *entry;
+ int rc, size;
+
+ /* no support for variable key & record size for now */
+ LASSERT((ii->ii_flags & II_FL_VARKEY) == 0);
+ LASSERT((ii->ii_flags & II_FL_VARREC) == 0);
+
+ /* initialize the header of the new container */
+ memset(lip, 0, LIP_HDR_SIZE);
+ lip->lip_magic = LIP_MAGIC;
+ nob -= LIP_HDR_SIZE;
+
+ /* compute size needed to store a key/record pair */
+ size = ii->ii_recsize + ii->ii_keysize;
+ if ((ii->ii_flags & II_FL_NOHASH) == 0)
+ /* add hash if the client wants it */
+ size += sizeof(__u64);
+
+ entry = lip->lip_entries;
+ do {
+ char *tmp_entry = entry;
+ struct dt_key *key;
+ __u64 hash;
+
+ /* fetch 64-bit hash value */
+ hash = iops->store(env, it);
+ ii->ii_hash_end = hash;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+ if (lip->lip_nr != 0) {
+ rc = 0;
+ goto out;
+ }
+ }
+
+ if (nob < size) {
+ if (lip->lip_nr == 0)
+ rc = -EINVAL;
+ else
+ rc = 0;
+ goto out;
+ }
+
+ if ((ii->ii_flags & II_FL_NOHASH) == 0) {
+ /* client wants the 64-bit hash value associated with
+ * each record */
+ memcpy(tmp_entry, &hash, sizeof(hash));
+ tmp_entry += sizeof(hash);
+ }
+
+ /* then the key value */
+ LASSERT(iops->key_size(env, it) == ii->ii_keysize);
+ key = iops->key(env, it);
+ memcpy(tmp_entry, key, ii->ii_keysize);
+ tmp_entry += ii->ii_keysize;
+
+ /* and finally the record */
+ rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr);
+ if (rc != -ESTALE) {
+ if (rc != 0)
+ goto out;
+
+ /* hash/key/record successfully copied! */
+ lip->lip_nr++;
+ if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0))
+ ii->ii_hash_start = hash;
+ entry = tmp_entry + ii->ii_recsize;
+ nob -= size;
+ }
+
+ /* move on to the next record */
+ do {
+ rc = iops->next(env, it);
+ } while (rc == -ESTALE);
+
+ } while (rc == 0);
+
+out:
+ if (rc >= 0 && lip->lip_nr > 0)
+ /* one more container */
+ ii->ii_count++;
+ if (rc > 0)
+ /* no more entries */
+ ii->ii_hash_end = II_END_OFF;
+ return rc;
+}
+
+/*
+ * Walk index and fill lu_page containers with key/record pairs
+ *
+ * \param env - is the environment passed by the caller
+ * \param obj - is the index object to parse
+ * \param rdpg - is the lu_rdpg descriptor associated with the transfer
+ * \param filler - is the callback function responsible for filling a lu_page
+ * with key/record pairs in the format wanted by the caller
+ * \param arg - is an opaque argument passed to the filler function
+ *
+ * \retval sum (in bytes) of all filled lu_pages
+ * \retval -ve errno on failure
+ */
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+ const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+ void *arg)
+{
+ struct dt_it *it;
+ const struct dt_it_ops *iops;
+ unsigned int pageidx, nob, nlupgs = 0;
+ int rc;
+
+ LASSERT(rdpg->rp_pages != NULL);
+ LASSERT(obj->do_index_ops != NULL);
+
+ nob = rdpg->rp_count;
+ if (nob <= 0)
+ return -EFAULT;
+
+ /* Iterate through index and fill containers from @rdpg */
+ iops = &obj->do_index_ops->dio_it;
+ LASSERT(iops != NULL);
+ it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA);
+ if (IS_ERR(it))
+ return PTR_ERR(it);
+
+ rc = iops->load(env, it, rdpg->rp_hash);
+ if (rc == 0) {
+ /*
+ * Iterator didn't find record with exactly the key requested.
+ *
+ * It is currently either
+ *
+ * - positioned above record with key less than
+ * requested---skip it.
+ * - or not positioned at all (is in IAM_IT_SKEWED
+ * state)---position it on the next item.
+ */
+ rc = iops->next(env, it);
+ } else if (rc > 0) {
+ rc = 0;
+ }
+
+ /* Fill containers one after the other. There might be multiple
+ * containers per physical page.
+ *
+ * At this point and across for-loop:
+ * rc == 0 -> ok, proceed.
+ * rc > 0 -> end of index.
+ * rc < 0 -> error. */
+ for (pageidx = 0; rc == 0 && nob > 0; pageidx++) {
+ union lu_page *lp;
+ int i;
+
+ LASSERT(pageidx < rdpg->rp_npages);
+ lp = kmap(rdpg->rp_pages[pageidx]);
+
+ /* fill lu pages */
+ for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) {
+ rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE),
+ iops, it, rdpg->rp_attrs, arg);
+ if (rc < 0)
+ break;
+ /* one more lu_page */
+ nlupgs++;
+ if (rc > 0)
+ /* end of index */
+ break;
+ }
+ kunmap(rdpg->rp_pages[pageidx]);
+ }
+
+ iops->put(env, it);
+ iops->fini(env, it);
+
+ if (rc >= 0)
+ rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count);
+
+ return rc;
+}
+EXPORT_SYMBOL(dt_index_walk);
+
+/**
+ * Walk key/record pairs of an index and copy them into 4KB containers to be
+ * transferred over the network. This is the common handler for OBD_IDX_READ
+ * RPC processing.
+ *
+ * \param env - is the environment passed by the caller
+ * \param dev - is the dt_device storing the index
+ * \param ii - is the idx_info structure packed by the client in the
+ * OBD_IDX_READ request
+ * \param rdpg - is the lu_rdpg descriptor
+ *
+ * \retval on success, return sum (in bytes) of all filled containers
+ * \retval appropriate error otherwise.
+ */
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+ struct idx_info *ii, const struct lu_rdpg *rdpg)
+{
+ const struct dt_index_features *feat;
+ struct dt_object *obj;
+ int rc;
+
+ /* rp_count shouldn't be zero and should be a multiple of the container
+ * size */
+ if (rdpg->rp_count <= 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0)
+ return -EFAULT;
+
+ if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL)
+ /* we don't support directory transfer via OBD_IDX_READ for the
+ * time being */
+ return -EOPNOTSUPP;
+
+ if (!fid_is_quota(&ii->ii_fid))
+ /* block access to all local files except quota files */
+ return -EPERM;
+
+ /* lookup index object subject to the transfer */
+ obj = dt_locate(env, dev, &ii->ii_fid);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ if (dt_object_exists(obj) == 0) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ /* fetch index features associated with index object */
+ feat = dt_index_feat_select(fid_seq(&ii->ii_fid),
+ lu_object_attr(&obj->do_lu));
+ if (IS_ERR(feat)) {
+ rc = PTR_ERR(feat);
+ goto out;
+ }
+
+ /* load index feature if not done already */
+ if (obj->do_index_ops == NULL) {
+ rc = obj->do_ops->do_index_try(env, obj, feat);
+ if (rc)
+ goto out;
+ }
+
+ /* fill ii_flags with supported index features */
+ ii->ii_flags &= II_FL_NOHASH;
+
+ ii->ii_keysize = feat->dif_keysize_max;
+ if ((feat->dif_flags & DT_IND_VARKEY) != 0) {
+ /* key size is variable */
+ ii->ii_flags |= II_FL_VARKEY;
+ /* we don't support variable key size for the time being */
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ii->ii_recsize = feat->dif_recsize_max;
+ if ((feat->dif_flags & DT_IND_VARREC) != 0) {
+ /* record size is variable */
+ ii->ii_flags |= II_FL_VARREC;
+ /* we don't support variable record size for the time being */
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if ((feat->dif_flags & DT_IND_NONUNQ) != 0)
+ /* key isn't necessarily unique */
+ ii->ii_flags |= II_FL_NONUNQ;
+
+ dt_read_lock(env, obj, 0);
+ /* fetch object version before walking the index */
+ ii->ii_version = dt_version_get(env, obj);
+
+ /* walk the index and fill lu_idxpages with key/record pairs */
+ rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii);
+ dt_read_unlock(env, obj);
+
+ if (rc == 0) {
+ /* index is empty */
+ LASSERT(ii->ii_count == 0);
+ ii->ii_hash_end = II_END_OFF;
+ }
+
+out:
+ lu_object_put(env, &obj->do_lu);
+ return rc;
+}
+EXPORT_SYMBOL(dt_index_read);
+
+#if defined(CONFIG_PROC_FS)
+
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct dt_device *dt = data;
+ struct obd_statfs osfs;
+ int rc = dt_statfs(NULL, dt, &osfs);
+
+ if (rc == 0) {
+ *eof = 1;
+ rc = snprintf(page, count, "%u\n",
+ (unsigned) osfs.os_bsize);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_blksize);
+
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct dt_device *dt = data;
+ struct obd_statfs osfs;
+ int rc = dt_statfs(NULL, dt, &osfs);
+
+ if (rc == 0) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_blocks;
+
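+ /* convert the block count to kilobytes: result = os_blocks *
+ * (os_bsize / 1024), computed with shifts on the assumption that
+ * the block size is a power of two of at least 1KB */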
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ *eof = 1;
+ rc = snprintf(page, count, "%llu\n", result);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal);
+
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct dt_device *dt = data;
+ struct obd_statfs osfs;
+ int rc = dt_statfs(NULL, dt, &osfs);
+
+ if (rc == 0) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_bfree;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ *eof = 1;
+ rc = snprintf(page, count, "%llu\n", result);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree);
+
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct dt_device *dt = data;
+ struct obd_statfs osfs;
+ int rc = dt_statfs(NULL, dt, &osfs);
+
+ if (rc == 0) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_bavail;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ *eof = 1;
+ rc = snprintf(page, count, "%llu\n", result);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail);
+
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct dt_device *dt = data;
+ struct obd_statfs osfs;
+ int rc = dt_statfs(NULL, dt, &osfs);
+
+ if (rc == 0) {
+ *eof = 1;
+ rc = snprintf(page, count, "%llu\n", osfs.os_files);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filestotal);
+
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct dt_device *dt = data;
+ struct obd_statfs osfs;
+ int rc = dt_statfs(NULL, dt, &osfs);
+
+ if (rc == 0) {
+ *eof = 1;
+ rc = snprintf(page, count, "%llu\n", osfs.os_ffree);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filesfree);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
new file mode 100644
index 000000000..66b56784f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/genops.c
@@ -0,0 +1,1833 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+
+spinlock_t obd_types_lock;
+
+struct kmem_cache *obd_device_cachep;
+struct kmem_cache *obdo_cachep;
+EXPORT_SYMBOL(obdo_cachep);
+static struct kmem_cache *import_cachep;
+
+static struct list_head obd_zombie_imports;
+static struct list_head obd_zombie_exports;
+static spinlock_t obd_zombie_impexp_lock;
+static void obd_zombie_impexp_notify(void);
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+ const char *status, int locks);
+
+int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OS's
+ */
+static struct obd_device *obd_device_alloc(void)
+{
+ struct obd_device *obd;
+
+ OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, GFP_NOFS);
+ if (obd != NULL) {
+ obd->obd_magic = OBD_DEVICE_MAGIC;
+ }
+ return obd;
+}
+
+static void obd_device_free(struct obd_device *obd)
+{
+ LASSERT(obd != NULL);
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ if (obd->obd_namespace != NULL) {
+ CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+ obd, obd->obd_namespace, obd->obd_force);
+ LBUG();
+ }
+ lu_ref_fini(&obd->obd_reference);
+ OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+struct obd_type *class_search_type(const char *name)
+{
+ struct list_head *tmp;
+ struct obd_type *type;
+
+ spin_lock(&obd_types_lock);
+ list_for_each(tmp, &obd_types) {
+ type = list_entry(tmp, struct obd_type, typ_chain);
+ if (strcmp(type->typ_name, name) == 0) {
+ spin_unlock(&obd_types_lock);
+ return type;
+ }
+ }
+ spin_unlock(&obd_types_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(class_search_type);
+
+struct obd_type *class_get_type(const char *name)
+{
+ struct obd_type *type = class_search_type(name);
+
+ if (!type) {
+ const char *modname = name;
+
+ if (strcmp(modname, "obdfilter") == 0)
+ modname = "ofd";
+
+ if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+ modname = LUSTRE_OSP_NAME;
+
+ if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)))
+ modname = LUSTRE_MDT_NAME;
+
+ if (!request_module("%s", modname)) {
+ CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+ type = class_search_type(name);
+ } else {
+ LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+ modname);
+ }
+ }
+ if (type) {
+ spin_lock(&type->obd_type_lock);
+ type->typ_refcnt++;
+ try_module_get(type->typ_dt_ops->o_owner);
+ spin_unlock(&type->obd_type_lock);
+ }
+ return type;
+}
+EXPORT_SYMBOL(class_get_type);
+
+void class_put_type(struct obd_type *type)
+{
+ LASSERT(type);
+ spin_lock(&type->obd_type_lock);
+ type->typ_refcnt--;
+ module_put(type->typ_dt_ops->o_owner);
+ spin_unlock(&type->obd_type_lock);
+}
+EXPORT_SYMBOL(class_put_type);
+
+#define CLASS_MAX_NAME 1024
+
+int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
+ struct lprocfs_vars *vars, const char *name,
+ struct lu_device_type *ldt)
+{
+ struct obd_type *type;
+ int rc = 0;
+
+ /* sanity check */
+ LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME);
+
+ if (class_search_type(name)) {
+ CDEBUG(D_IOCTL, "Type %s already registered\n", name);
+ return -EEXIST;
+ }
+
+ rc = -ENOMEM;
+ OBD_ALLOC(type, sizeof(*type));
+ if (type == NULL)
+ return rc;
+
+ OBD_ALLOC_PTR(type->typ_dt_ops);
+ OBD_ALLOC_PTR(type->typ_md_ops);
+ OBD_ALLOC(type->typ_name, strlen(name) + 1);
+
+ if (type->typ_dt_ops == NULL ||
+ type->typ_md_ops == NULL ||
+ type->typ_name == NULL)
+ goto failed;
+
+ *(type->typ_dt_ops) = *dt_ops;
+ /* md_ops is optional */
+ if (md_ops)
+ *(type->typ_md_ops) = *md_ops;
+ strcpy(type->typ_name, name);
+ spin_lock_init(&type->obd_type_lock);
+
+ type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
+ vars, type);
+ if (IS_ERR(type->typ_procroot)) {
+ rc = PTR_ERR(type->typ_procroot);
+ type->typ_procroot = NULL;
+ goto failed;
+ }
+
+ if (ldt != NULL) {
+ type->typ_lu = ldt;
+ rc = lu_device_type_init(ldt);
+ if (rc != 0)
+ goto failed;
+ }
+
+ spin_lock(&obd_types_lock);
+ list_add(&type->typ_chain, &obd_types);
+ spin_unlock(&obd_types_lock);
+
+ return 0;
+
+ failed:
+ if (type->typ_name != NULL)
+ OBD_FREE(type->typ_name, strlen(name) + 1);
+ if (type->typ_md_ops != NULL)
+ OBD_FREE_PTR(type->typ_md_ops);
+ if (type->typ_dt_ops != NULL)
+ OBD_FREE_PTR(type->typ_dt_ops);
+ OBD_FREE(type, sizeof(*type));
+ return rc;
+}
+EXPORT_SYMBOL(class_register_type);
+
+int class_unregister_type(const char *name)
+{
+ struct obd_type *type = class_search_type(name);
+
+ if (!type) {
+ CERROR("unknown obd type\n");
+ return -EINVAL;
+ }
+
+ if (type->typ_refcnt) {
+ CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
+ /* This is a bad situation, let's make the best of it */
+ /* Remove ops, but leave the name for debugging */
+ OBD_FREE_PTR(type->typ_dt_ops);
+ OBD_FREE_PTR(type->typ_md_ops);
+ return -EBUSY;
+ }
+
+ if (type->typ_procroot) {
+ lprocfs_remove(&type->typ_procroot);
+ }
+
+ if (type->typ_lu)
+ lu_device_type_fini(type->typ_lu);
+
+ spin_lock(&obd_types_lock);
+ list_del(&type->typ_chain);
+ spin_unlock(&obd_types_lock);
+ OBD_FREE(type->typ_name, strlen(name) + 1);
+ if (type->typ_dt_ops != NULL)
+ OBD_FREE_PTR(type->typ_dt_ops);
+ if (type->typ_md_ops != NULL)
+ OBD_FREE_PTR(type->typ_md_ops);
+ OBD_FREE(type, sizeof(*type));
+ return 0;
+} /* class_unregister_type */
+EXPORT_SYMBOL(class_unregister_type);
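+
+/*
+ * Registration sketch (hypothetical caller; sample_obd_ops, the lprocfs
+ * variable table and the "sample" type name are invented for illustration):
+ * an obdclass user typically registers its type at module load and
+ * unregisters it at unload:
+ *
+ *	static struct obd_ops sample_obd_ops = { .o_owner = THIS_MODULE, ... };
+ *
+ *	rc = class_register_type(&sample_obd_ops, NULL, sample_lprocfs_vars,
+ *				 "sample", NULL);
+ *	...
+ *	class_unregister_type("sample");
+ *
+ * The md_ops and lu_device_type arguments are optional, as handled above.
+ */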
+
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ *
+ * \param[in] type_name obd device type string.
+ * \param[in] name obd device name.
+ *
+ * \retval NULL if create fails, otherwise return the obd device
+ * pointer created.
+ */
+struct obd_device *class_newdev(const char *type_name, const char *name)
+{
+ struct obd_device *result = NULL;
+ struct obd_device *newdev;
+ struct obd_type *type = NULL;
+ int i;
+ int new_obd_minor = 0;
+
+ if (strlen(name) >= MAX_OBD_NAME) {
+ CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
+ return ERR_PTR(-EINVAL);
+ }
+
+ type = class_get_type(type_name);
+ if (type == NULL) {
+ CERROR("OBD: unknown type: %s\n", type_name);
+ return ERR_PTR(-ENODEV);
+ }
+
+ newdev = obd_device_alloc();
+ if (newdev == NULL) {
+ result = ERR_PTR(-ENOMEM);
+ goto out_type;
+ }
+
+ LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+
+ write_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd && (strcmp(name, obd->obd_name) == 0)) {
+ CERROR("Device %s already exists at %d, won't add\n",
+ name, i);
+ if (result) {
+ LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
+ "%p obd_magic %08x != %08x\n", result,
+ result->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(result->obd_minor == new_obd_minor,
+ "%p obd_minor %d != %d\n", result,
+ result->obd_minor, new_obd_minor);
+
+ obd_devs[result->obd_minor] = NULL;
+ result->obd_name[0] = '\0';
+ }
+ result = ERR_PTR(-EEXIST);
+ break;
+ }
+ if (!result && !obd) {
+ result = newdev;
+ result->obd_minor = i;
+ new_obd_minor = i;
+ result->obd_type = type;
+ strncpy(result->obd_name, name,
+ sizeof(result->obd_name) - 1);
+ obd_devs[i] = result;
+ }
+ }
+ write_unlock(&obd_dev_lock);
+
+ if (result == NULL && i >= class_devno_max()) {
+ CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
+ class_devno_max());
+ result = ERR_PTR(-EOVERFLOW);
+ goto out;
+ }
+
+ if (IS_ERR(result))
+ goto out;
+
+ CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+ result->obd_name, result);
+
+ return result;
+out:
+ obd_device_free(newdev);
+out_type:
+ class_put_type(type);
+ return result;
+}
+
+void class_release_dev(struct obd_device *obd)
+{
+ struct obd_type *obd_type = obd->obd_type;
+
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n",
+ obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+ LASSERT(obd_type != NULL);
+
+ CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
+ obd->obd_name, obd->obd_minor, obd->obd_type->typ_name);
+
+ write_lock(&obd_dev_lock);
+ obd_devs[obd->obd_minor] = NULL;
+ write_unlock(&obd_dev_lock);
+ obd_device_free(obd);
+
+ class_put_type(obd_type);
+}
+
+int class_name2dev(const char *name)
+{
+ int i;
+
+ if (!name)
+ return -1;
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd && strcmp(name, obd->obd_name) == 0) {
+ /* Make sure we finished attaching before we give
+ out any references */
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ if (obd->obd_attached) {
+ read_unlock(&obd_dev_lock);
+ return i;
+ }
+ break;
+ }
+ }
+ read_unlock(&obd_dev_lock);
+
+ return -1;
+}
+EXPORT_SYMBOL(class_name2dev);
+
+struct obd_device *class_name2obd(const char *name)
+{
+ int dev = class_name2dev(name);
+
+ if (dev < 0 || dev > class_devno_max())
+ return NULL;
+ return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_name2obd);
+
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+ int i;
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) {
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ read_unlock(&obd_dev_lock);
+ return i;
+ }
+ }
+ read_unlock(&obd_dev_lock);
+
+ return -1;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+ int dev = class_uuid2dev(uuid);
+ if (dev < 0)
+ return NULL;
+ return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if ::obd_devs[\a num] does not contain an obd device
+ * otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+ struct obd_device *obd = NULL;
+
+ if (num < class_devno_max()) {
+ obd = obd_devs[num];
+ if (obd == NULL)
+ return NULL;
+
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+ "%p obd_magic %08x != %08x\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(obd->obd_minor == num,
+ "%p obd_minor %0d != %0d\n",
+ obd, obd->obd_minor, num);
+ }
+
+ return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get the obd device count. Devices in any
+ * state are counted.
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+ int index, max_index = class_devno_max(), dev_count = 0;
+
+ read_lock(&obd_dev_lock);
+ for (index = 0; index <= max_index; index++) {
+ struct obd_device *obd = class_num2obd(index);
+ if (obd != NULL)
+ dev_count++;
+ }
+ read_unlock(&obd_dev_lock);
+
+ return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+void class_obd_list(void)
+{
+ char *status;
+ int i;
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd == NULL)
+ continue;
+ if (obd->obd_stopping)
+ status = "ST";
+ else if (obd->obd_set_up)
+ status = "UP";
+ else if (obd->obd_attached)
+ status = "AT";
+ else
+ status = "--";
+ LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+ i, status, obd->obd_type->typ_name,
+ obd->obd_name, obd->obd_uuid.uuid,
+ atomic_read(&obd->obd_refcount));
+ }
+ read_unlock(&obd_dev_lock);
+ return;
+}
+
+/* Search for a client OBD connected to tgt_uuid. If grp_uuid is
+ specified, then only the client with that uuid is returned,
+ otherwise any client connected to the tgt is returned. */
+struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid,
+ const char *typ_name,
+ struct obd_uuid *grp_uuid)
+{
+ int i;
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd == NULL)
+ continue;
+ if ((strncmp(obd->obd_type->typ_name, typ_name,
+ strlen(typ_name)) == 0)) {
+ if (obd_uuid_equals(tgt_uuid,
+ &obd->u.cli.cl_target_uuid) &&
+ ((grp_uuid)? obd_uuid_equals(grp_uuid,
+ &obd->obd_uuid) : 1)) {
+ read_unlock(&obd_dev_lock);
+ return obd;
+ }
+ }
+ }
+ read_unlock(&obd_dev_lock);
+
+ return NULL;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate the obd_device list looking for devices that have grp_uuid. Start
+ searching at *next, and if a device is found, the next index to look
+ at is saved in *next. If next is NULL, then the first matching device
+ will always be returned. */
+struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
+{
+ int i;
+
+ if (next == NULL)
+ i = 0;
+ else if (*next >= 0 && *next < class_devno_max())
+ i = *next;
+ else
+ return NULL;
+
+ read_lock(&obd_dev_lock);
+ for (; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd == NULL)
+ continue;
+ if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) {
+ if (next != NULL)
+ *next = i+1;
+ read_unlock(&obd_dev_lock);
+ return obd;
+ }
+ }
+ read_unlock(&obd_dev_lock);
+
+ return NULL;
+}
+EXPORT_SYMBOL(class_devices_in_group);
+
+/**
+ * Notify every relevant OBD that the sptlrpc log for \a fsname has changed,
+ * so that it can adjust its sptlrpc settings accordingly.
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+ struct obd_device *obd;
+ const char *type;
+ int i, rc = 0, rc2;
+
+ LASSERT(namelen > 0);
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ obd = class_num2obd(i);
+
+ if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+ continue;
+
+ /* only notify mdc, osc, mdt, ost */
+ type = obd->obd_type->typ_name;
+ if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+ strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+ strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+ strcmp(type, LUSTRE_OST_NAME) != 0)
+ continue;
+
+ if (strncmp(obd->obd_name, fsname, namelen))
+ continue;
+
+ class_incref(obd, __func__, obd);
+ read_unlock(&obd_dev_lock);
+ rc2 = obd_set_info_async(NULL, obd->obd_self_export,
+ sizeof(KEY_SPTLRPC_CONF),
+ KEY_SPTLRPC_CONF, 0, NULL, NULL);
+ rc = rc ? rc : rc2;
+ class_decref(obd, __func__, obd);
+ read_lock(&obd_dev_lock);
+ }
+ read_unlock(&obd_dev_lock);
+ return rc;
+}
+EXPORT_SYMBOL(class_notify_sptlrpc_conf);
+
+void obd_cleanup_caches(void)
+{
+ if (obd_device_cachep) {
+ kmem_cache_destroy(obd_device_cachep);
+ obd_device_cachep = NULL;
+ }
+ if (obdo_cachep) {
+ kmem_cache_destroy(obdo_cachep);
+ obdo_cachep = NULL;
+ }
+ if (import_cachep) {
+ kmem_cache_destroy(import_cachep);
+ import_cachep = NULL;
+ }
+ if (capa_cachep) {
+ kmem_cache_destroy(capa_cachep);
+ capa_cachep = NULL;
+ }
+}
+
+int obd_init_caches(void)
+{
+ LASSERT(obd_device_cachep == NULL);
+ obd_device_cachep = kmem_cache_create("ll_obd_dev_cache",
+ sizeof(struct obd_device),
+ 0, 0, NULL);
+ if (!obd_device_cachep)
+ goto out;
+
+ LASSERT(obdo_cachep == NULL);
+ obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo),
+ 0, 0, NULL);
+ if (!obdo_cachep)
+ goto out;
+
+ LASSERT(import_cachep == NULL);
+ import_cachep = kmem_cache_create("ll_import_cache",
+ sizeof(struct obd_import),
+ 0, 0, NULL);
+ if (!import_cachep)
+ goto out;
+
+ LASSERT(capa_cachep == NULL);
+ capa_cachep = kmem_cache_create("capa_cache",
+ sizeof(struct obd_capa), 0, 0, NULL);
+ if (!capa_cachep)
+ goto out;
+
+ return 0;
+ out:
+ obd_cleanup_caches();
+ return -ENOMEM;
+}
+
+/* map connection to client */
+struct obd_export *class_conn2export(struct lustre_handle *conn)
+{
+ struct obd_export *export;
+
+ if (!conn) {
+ CDEBUG(D_CACHE, "looking for null handle\n");
+ return NULL;
+ }
+
+ if (conn->cookie == -1) { /* this means assign a new connection */
+ CDEBUG(D_CACHE, "want a new connection\n");
+ return NULL;
+ }
+
+ CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie);
+ export = class_handle2object(conn->cookie);
+ return export;
+}
+EXPORT_SYMBOL(class_conn2export);
+
+struct obd_device *class_exp2obd(struct obd_export *exp)
+{
+ if (exp)
+ return exp->exp_obd;
+ return NULL;
+}
+EXPORT_SYMBOL(class_exp2obd);
+
+struct obd_device *class_conn2obd(struct lustre_handle *conn)
+{
+ struct obd_export *export;
+ export = class_conn2export(conn);
+ if (export) {
+ struct obd_device *obd = export->exp_obd;
+ class_export_put(export);
+ return obd;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(class_conn2obd);
+
+struct obd_import *class_exp2cliimp(struct obd_export *exp)
+{
+ struct obd_device *obd = exp->exp_obd;
+ if (obd == NULL)
+ return NULL;
+ return obd->u.cli.cl_import;
+}
+EXPORT_SYMBOL(class_exp2cliimp);
+
+struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
+{
+ struct obd_device *obd = class_conn2obd(conn);
+ if (obd == NULL)
+ return NULL;
+ return obd->u.cli.cl_import;
+}
+EXPORT_SYMBOL(class_conn2cliimp);
+
+/* Export management functions */
+static void class_export_destroy(struct obd_export *exp)
+{
+ struct obd_device *obd = exp->exp_obd;
+
+ LASSERT_ATOMIC_ZERO(&exp->exp_refcount);
+ LASSERT(obd != NULL);
+
+ CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp,
+ exp->exp_client_uuid.uuid, obd->obd_name);
+
+ /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+ if (exp->exp_connection)
+ ptlrpc_put_connection_superhack(exp->exp_connection);
+
+ LASSERT(list_empty(&exp->exp_outstanding_replies));
+ LASSERT(list_empty(&exp->exp_uncommitted_replies));
+ LASSERT(list_empty(&exp->exp_req_replay_queue));
+ LASSERT(list_empty(&exp->exp_hp_rpcs));
+ obd_destroy_export(exp);
+ class_decref(obd, "export", exp);
+
+ OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
+}
+
+static void export_handle_addref(void *export)
+{
+ class_export_get(export);
+}
+
+static struct portals_handle_ops export_handle_ops = {
+ .hop_addref = export_handle_addref,
+ .hop_free = NULL,
+};
+
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+ atomic_inc(&exp->exp_refcount);
+ CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp,
+ atomic_read(&exp->exp_refcount));
+ return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+void class_export_put(struct obd_export *exp)
+{
+ LASSERT(exp != NULL);
+ LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
+ CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+ atomic_read(&exp->exp_refcount) - 1);
+
+ if (atomic_dec_and_test(&exp->exp_refcount)) {
+ LASSERT(!list_empty(&exp->exp_obd_chain));
+ CDEBUG(D_IOCTL, "final put %p/%s\n",
+ exp, exp->exp_client_uuid.uuid);
+
+ /* release nid stat reference */
+ lprocfs_exp_cleanup(exp);
+
+ obd_zombie_export_add(exp);
+ }
+}
+EXPORT_SYMBOL(class_export_put);
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function. */
+struct obd_export *class_new_export(struct obd_device *obd,
+ struct obd_uuid *cluuid)
+{
+ struct obd_export *export;
+ struct cfs_hash *hash = NULL;
+ int rc = 0;
+
+ OBD_ALLOC_PTR(export);
+ if (!export)
+ return ERR_PTR(-ENOMEM);
+
+ export->exp_conn_cnt = 0;
+ export->exp_lock_hash = NULL;
+ export->exp_flock_hash = NULL;
+ atomic_set(&export->exp_refcount, 2);
+ atomic_set(&export->exp_rpc_count, 0);
+ atomic_set(&export->exp_cb_count, 0);
+ atomic_set(&export->exp_locks_count, 0);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ INIT_LIST_HEAD(&export->exp_locks_list);
+ spin_lock_init(&export->exp_locks_list_guard);
+#endif
+ atomic_set(&export->exp_replay_count, 0);
+ export->exp_obd = obd;
+ INIT_LIST_HEAD(&export->exp_outstanding_replies);
+ spin_lock_init(&export->exp_uncommitted_replies_lock);
+ INIT_LIST_HEAD(&export->exp_uncommitted_replies);
+ INIT_LIST_HEAD(&export->exp_req_replay_queue);
+ INIT_LIST_HEAD(&export->exp_handle.h_link);
+ INIT_LIST_HEAD(&export->exp_hp_rpcs);
+ class_handle_hash(&export->exp_handle, &export_handle_ops);
+ export->exp_last_request_time = get_seconds();
+ spin_lock_init(&export->exp_lock);
+ spin_lock_init(&export->exp_rpc_lock);
+ INIT_HLIST_NODE(&export->exp_uuid_hash);
+ INIT_HLIST_NODE(&export->exp_nid_hash);
+ spin_lock_init(&export->exp_bl_list_lock);
+ INIT_LIST_HEAD(&export->exp_bl_list);
+
+ export->exp_sp_peer = LUSTRE_SP_ANY;
+ export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID;
+ export->exp_client_uuid = *cluuid;
+ obd_init_export(export);
+
+ spin_lock(&obd->obd_dev_lock);
+ /* shouldn't happen, but might race */
+ if (obd->obd_stopping) {
+ rc = -ENODEV;
+ goto exit_unlock;
+ }
+
+ hash = cfs_hash_getref(obd->obd_uuid_hash);
+ if (hash == NULL) {
+ rc = -ENODEV;
+ goto exit_unlock;
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+ rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
+ if (rc != 0) {
+ LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+ obd->obd_name, cluuid->uuid, rc);
+ rc = -EALREADY;
+ goto exit_err;
+ }
+ }
+
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_stopping) {
+ cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+ rc = -ENODEV;
+ goto exit_unlock;
+ }
+
+ class_incref(obd, "export", export);
+ list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+ list_add_tail(&export->exp_obd_chain_timed,
+ &export->exp_obd->obd_exports_timed);
+ export->exp_obd->obd_num_exports++;
+ spin_unlock(&obd->obd_dev_lock);
+ cfs_hash_putref(hash);
+ return export;
+
+exit_unlock:
+ spin_unlock(&obd->obd_dev_lock);
+exit_err:
+ if (hash)
+ cfs_hash_putref(hash);
+ class_handle_unhash(&export->exp_handle);
+ LASSERT(hlist_unhashed(&export->exp_uuid_hash));
+ obd_destroy_export(export);
+ OBD_FREE_PTR(export);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(class_new_export);
+
+void class_unlink_export(struct obd_export *exp)
+{
+ class_handle_unhash(&exp->exp_handle);
+
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+ /* delete the uuid-export hash item from the hash table */
+ if (!hlist_unhashed(&exp->exp_uuid_hash))
+ cfs_hash_del(exp->exp_obd->obd_uuid_hash,
+ &exp->exp_client_uuid,
+ &exp->exp_uuid_hash);
+
+ list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
+ list_del_init(&exp->exp_obd_chain_timed);
+ exp->exp_obd->obd_num_exports--;
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ class_export_put(exp);
+}
+EXPORT_SYMBOL(class_unlink_export);
+
+/* Import management functions */
+static void class_import_destroy(struct obd_import *imp)
+{
+ CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp,
+ imp->imp_obd->obd_name);
+
+ LASSERT_ATOMIC_ZERO(&imp->imp_refcount);
+
+ ptlrpc_put_connection_superhack(imp->imp_connection);
+
+ while (!list_empty(&imp->imp_conn_list)) {
+ struct obd_import_conn *imp_conn;
+
+ imp_conn = list_entry(imp->imp_conn_list.next,
+ struct obd_import_conn, oic_item);
+ list_del_init(&imp_conn->oic_item);
+ ptlrpc_put_connection_superhack(imp_conn->oic_conn);
+ OBD_FREE(imp_conn, sizeof(*imp_conn));
+ }
+
+ LASSERT(imp->imp_sec == NULL);
+ class_decref(imp->imp_obd, "import", imp);
+ OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle);
+}
+
+static void import_handle_addref(void *import)
+{
+ class_import_get(import);
+}
+
+static struct portals_handle_ops import_handle_ops = {
+ .hop_addref = import_handle_addref,
+ .hop_free = NULL,
+};
+
+struct obd_import *class_import_get(struct obd_import *import)
+{
+ atomic_inc(&import->imp_refcount);
+ CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
+ atomic_read(&import->imp_refcount),
+ import->imp_obd->obd_name);
+ return import;
+}
+EXPORT_SYMBOL(class_import_get);
+
+void class_import_put(struct obd_import *imp)
+{
+ LASSERT(list_empty(&imp->imp_zombie_chain));
+ LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);
+
+ CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
+ atomic_read(&imp->imp_refcount) - 1,
+ imp->imp_obd->obd_name);
+
+ if (atomic_dec_and_test(&imp->imp_refcount)) {
+ CDEBUG(D_INFO, "final put import %p\n", imp);
+ obd_zombie_import_add(imp);
+ }
+
+ /* catch possible import put race */
+ LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
+}
+EXPORT_SYMBOL(class_import_put);
+
+static void init_imp_at(struct imp_at *at)
+{
+ int i;
+ at_init(&at->iat_net_latency, 0, 0);
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ /* max service estimates are tracked on the server side, so
+ don't use the AT history here, just use the last reported
+ val. (But keep hist for proc histogram, worst_ever) */
+ at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
+ AT_FLG_NOHIST);
+ }
+}
+
+struct obd_import *class_new_import(struct obd_device *obd)
+{
+ struct obd_import *imp;
+
+ OBD_ALLOC(imp, sizeof(*imp));
+ if (imp == NULL)
+ return NULL;
+
+ INIT_LIST_HEAD(&imp->imp_pinger_chain);
+ INIT_LIST_HEAD(&imp->imp_zombie_chain);
+ INIT_LIST_HEAD(&imp->imp_replay_list);
+ INIT_LIST_HEAD(&imp->imp_sending_list);
+ INIT_LIST_HEAD(&imp->imp_delayed_list);
+ INIT_LIST_HEAD(&imp->imp_committed_list);
+ imp->imp_replay_cursor = &imp->imp_committed_list;
+ spin_lock_init(&imp->imp_lock);
+ imp->imp_last_success_conn = 0;
+ imp->imp_state = LUSTRE_IMP_NEW;
+ imp->imp_obd = class_incref(obd, "import", imp);
+ mutex_init(&imp->imp_sec_mutex);
+ init_waitqueue_head(&imp->imp_recovery_waitq);
+
+ atomic_set(&imp->imp_refcount, 2);
+ atomic_set(&imp->imp_unregistering, 0);
+ atomic_set(&imp->imp_inflight, 0);
+ atomic_set(&imp->imp_replay_inflight, 0);
+ atomic_set(&imp->imp_inval_count, 0);
+ INIT_LIST_HEAD(&imp->imp_conn_list);
+ INIT_LIST_HEAD(&imp->imp_handle.h_link);
+ class_handle_hash(&imp->imp_handle, &import_handle_ops);
+ init_imp_at(&imp->imp_at);
+
+ /* the default magic is V2, will be used in connect RPC, and
+ * then adjusted according to the flags in request/reply. */
+ imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
+
+ return imp;
+}
+EXPORT_SYMBOL(class_new_import);
+
+void class_destroy_import(struct obd_import *import)
+{
+ LASSERT(import != NULL);
+ LASSERT(import != LP_POISON);
+
+ class_handle_unhash(&import->imp_handle);
+
+ spin_lock(&import->imp_lock);
+ import->imp_generation++;
+ spin_unlock(&import->imp_lock);
+ class_import_put(import);
+}
+EXPORT_SYMBOL(class_destroy_import);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+ spin_lock(&exp->exp_locks_list_guard);
+
+ LASSERT(lock->l_exp_refs_nr >= 0);
+
+ if (lock->l_exp_refs_target != NULL &&
+ lock->l_exp_refs_target != exp) {
+ LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n",
+ exp, lock, lock->l_exp_refs_target);
+ }
+ if ((lock->l_exp_refs_nr++) == 0) {
+ list_add(&lock->l_exp_refs_link, &exp->exp_locks_list);
+ lock->l_exp_refs_target = exp;
+ }
+ CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+ lock, exp, lock->l_exp_refs_nr);
+ spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_add_lock_ref);
+
+void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+ spin_lock(&exp->exp_locks_list_guard);
+ LASSERT(lock->l_exp_refs_nr > 0);
+ if (lock->l_exp_refs_target != exp) {
+ LCONSOLE_WARN("lock %p, mismatching export pointers: %p, %p\n",
+ lock, lock->l_exp_refs_target, exp);
+ }
+ if (--lock->l_exp_refs_nr == 0) {
+ list_del_init(&lock->l_exp_refs_link);
+ lock->l_exp_refs_target = NULL;
+ }
+ CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+ lock, exp, lock->l_exp_refs_nr);
+ spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_del_lock_ref);
+#endif
+
+/* A connection defines an export context in which preallocation can
+ be managed. This releases the export pointer reference, and returns
+ the export handle, so the export refcount is 1 when this function
+ returns. */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+ struct obd_uuid *cluuid)
+{
+ struct obd_export *export;
+ LASSERT(conn != NULL);
+ LASSERT(obd != NULL);
+ LASSERT(cluuid != NULL);
+
+ export = class_new_export(obd, cluuid);
+ if (IS_ERR(export))
+ return PTR_ERR(export);
+
+ conn->cookie = export->exp_handle.h_cookie;
+ class_export_put(export);
+
+ CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n",
+ cluuid->uuid, conn->cookie);
+ return 0;
+}
+EXPORT_SYMBOL(class_connect);
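+
+/*
+ * Sketch of the expected pairing, based on the helpers in this file: the
+ * cookie stored in the lustre_handle is later resolved back to the export
+ * with class_conn2export(), which takes a reference the caller must drop:
+ *
+ *	class_connect(&conn, obd, &cluuid);
+ *	exp = class_conn2export(&conn);
+ *	...
+ *	class_export_put(exp);
+ */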
+
+/* if export is involved in recovery then clean up related things */
+static void class_export_recovery_cleanup(struct obd_export *exp)
+{
+ struct obd_device *obd = exp->exp_obd;
+
+ spin_lock(&obd->obd_recovery_task_lock);
+ if (exp->exp_delayed)
+ obd->obd_delayed_clients--;
+ if (obd->obd_recovering) {
+ if (exp->exp_in_recovery) {
+ spin_lock(&exp->exp_lock);
+ exp->exp_in_recovery = 0;
+ spin_unlock(&exp->exp_lock);
+ LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+ atomic_dec(&obd->obd_connected_clients);
+ }
+
+ /* if called during recovery then we should update the
+ * obd_stale_clients counter;
+ * lightweight exports are not counted */
+ if (exp->exp_failed &&
+ (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+ exp->exp_obd->obd_stale_clients++;
+ }
+ spin_unlock(&obd->obd_recovery_task_lock);
+
+ spin_lock(&exp->exp_lock);
+ /** Cleanup req replay fields */
+ if (exp->exp_req_replay_needed) {
+ exp->exp_req_replay_needed = 0;
+
+ LASSERT(atomic_read(&obd->obd_req_replay_clients));
+ atomic_dec(&obd->obd_req_replay_clients);
+ }
+
+ /** Cleanup lock replay data */
+ if (exp->exp_lock_replay_needed) {
+ exp->exp_lock_replay_needed = 0;
+
+ LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+ atomic_dec(&obd->obd_lock_replay_clients);
+ }
+ spin_unlock(&exp->exp_lock);
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed,
+ * and, if the disconnect is really needed,
+ * 2 - for removing it from the hash,
+ * 3 - in class_unlink_export.
+ * The export pointer passed to this function can be destroyed. */
+int class_disconnect(struct obd_export *export)
+{
+ int already_disconnected;
+
+ if (export == NULL) {
+ CWARN("attempting to free NULL export %p\n", export);
+ return -EINVAL;
+ }
+
+ spin_lock(&export->exp_lock);
+ already_disconnected = export->exp_disconnected;
+ export->exp_disconnected = 1;
+ spin_unlock(&export->exp_lock);
+
+ /* class_cleanup(), abort_recovery(), and class_fail_export()
+ * all end up in here, and if any of them race we shouldn't
+ * call extra class_export_puts(). */
+ if (already_disconnected) {
+ LASSERT(hlist_unhashed(&export->exp_nid_hash));
+ goto no_disconn;
+ }
+
+ CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n",
+ export->exp_handle.h_cookie);
+
+ if (!hlist_unhashed(&export->exp_nid_hash))
+ cfs_hash_del(export->exp_obd->obd_nid_hash,
+ &export->exp_connection->c_peer.nid,
+ &export->exp_nid_hash);
+
+ class_export_recovery_cleanup(export);
+ class_unlink_export(export);
+no_disconn:
+ class_export_put(export);
+ return 0;
+}
+EXPORT_SYMBOL(class_disconnect);
+
+/* Return non-zero for a fully connected export */
+int class_connected_export(struct obd_export *exp)
+{
+ if (exp) {
+ int connected;
+ spin_lock(&exp->exp_lock);
+ connected = exp->exp_conn_cnt > 0;
+ spin_unlock(&exp->exp_lock);
+ return connected;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(class_connected_export);
+
+static void class_disconnect_export_list(struct list_head *list,
+ enum obd_option flags)
+{
+ int rc;
+ struct obd_export *exp;
+
+ /* It's possible that an export may disconnect itself, but
+ * nothing else will be added to this list. */
+ while (!list_empty(list)) {
+ exp = list_entry(list->next, struct obd_export,
+ exp_obd_chain);
+ /* needed so CDEBUG can safely be called after obd_disconnect */
+ class_export_get(exp);
+
+ spin_lock(&exp->exp_lock);
+ exp->exp_flags = flags;
+ spin_unlock(&exp->exp_lock);
+
+ if (obd_uuid_equals(&exp->exp_client_uuid,
+ &exp->exp_obd->obd_uuid)) {
+ CDEBUG(D_HA,
+ "exp %p export uuid == obd uuid, don't discon\n",
+ exp);
+ /* Need to delete this now so we don't end up pointing
+ * to work_list later when this export is cleaned up. */
+ list_del_init(&exp->exp_obd_chain);
+ class_export_put(exp);
+ continue;
+ }
+
+ class_export_get(exp);
+ CDEBUG(D_HA, "%s: disconnecting export at %s (%p), last request at " CFS_TIME_T "\n",
+ exp->exp_obd->obd_name, obd_export_nid2str(exp),
+ exp, exp->exp_last_request_time);
+ /* release one export reference anyway */
+ rc = obd_disconnect(exp);
+
+ CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+ obd_export_nid2str(exp), exp, rc);
+ class_export_put(exp);
+ }
+}
+
+void class_disconnect_exports(struct obd_device *obd)
+{
+ struct list_head work_list;
+
+ /* Move all of the exports from obd_exports to a work list, en masse. */
+ INIT_LIST_HEAD(&work_list);
+ spin_lock(&obd->obd_dev_lock);
+ list_splice_init(&obd->obd_exports, &work_list);
+ list_splice_init(&obd->obd_delayed_exports, &work_list);
+ spin_unlock(&obd->obd_dev_lock);
+
+ if (!list_empty(&work_list)) {
+ CDEBUG(D_HA, "OBD device %d (%p) has exports, disconnecting them\n",
+ obd->obd_minor, obd);
+ class_disconnect_export_list(&work_list,
+ exp_flags_from_obd(obd));
+ } else
+ CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
+ obd->obd_minor, obd);
+}
+EXPORT_SYMBOL(class_disconnect_exports);
+
+/* Remove exports that have not completed recovery.
+ */
+void class_disconnect_stale_exports(struct obd_device *obd,
+ int (*test_export)(struct obd_export *))
+{
+ struct list_head work_list;
+ struct obd_export *exp, *n;
+ int evicted = 0;
+
+ INIT_LIST_HEAD(&work_list);
+ spin_lock(&obd->obd_dev_lock);
+ list_for_each_entry_safe(exp, n, &obd->obd_exports,
+ exp_obd_chain) {
+ /* don't count self-export as client */
+ if (obd_uuid_equals(&exp->exp_client_uuid,
+ &exp->exp_obd->obd_uuid))
+ continue;
+
+ /* don't evict clients which have no slot in last_rcvd
+ * (e.g. lightweight connection) */
+ if (exp->exp_target_data.ted_lr_idx == -1)
+ continue;
+
+ spin_lock(&exp->exp_lock);
+ if (exp->exp_failed || test_export(exp)) {
+ spin_unlock(&exp->exp_lock);
+ continue;
+ }
+ exp->exp_failed = 1;
+ spin_unlock(&exp->exp_lock);
+
+ list_move(&exp->exp_obd_chain, &work_list);
+ evicted++;
+ CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n",
+ obd->obd_name, exp->exp_client_uuid.uuid,
+ exp->exp_connection == NULL ? "<unknown>" :
+ libcfs_nid2str(exp->exp_connection->c_peer.nid));
+ print_export_data(exp, "EVICTING", 0);
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ if (evicted)
+ LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+ obd->obd_name, evicted);
+
+ class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+ OBD_OPT_ABORT_RECOV);
+}
+EXPORT_SYMBOL(class_disconnect_stale_exports);
+
+void class_fail_export(struct obd_export *exp)
+{
+ int rc, already_failed;
+
+ spin_lock(&exp->exp_lock);
+ already_failed = exp->exp_failed;
+ exp->exp_failed = 1;
+ spin_unlock(&exp->exp_lock);
+
+ if (already_failed) {
+ CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+ exp, exp->exp_client_uuid.uuid);
+ return;
+ }
+
+ CDEBUG(D_HA, "disconnecting export %p/%s\n",
+ exp, exp->exp_client_uuid.uuid);
+
+ if (obd_dump_on_timeout)
+ libcfs_debug_dumplog();
+
+ /* needed so CDEBUG can safely be called after obd_disconnect */
+ class_export_get(exp);
+
+ /* Most callers into obd_disconnect are removing their own reference
+ * (request, for example) in addition to the one from the hash table.
+ * We don't have such a reference here, so make one. */
+ class_export_get(exp);
+ rc = obd_disconnect(exp);
+ if (rc)
+ CERROR("disconnecting export %p failed: %d\n", exp, rc);
+ else
+ CDEBUG(D_HA, "disconnected export %p/%s\n",
+ exp, exp->exp_client_uuid.uuid);
+ class_export_put(exp);
+}
+EXPORT_SYMBOL(class_fail_export);
+
+char *obd_export_nid2str(struct obd_export *exp)
+{
+ if (exp->exp_connection != NULL)
+ return libcfs_nid2str(exp->exp_connection->c_peer.nid);
+
+ return "(no nid)";
+}
+EXPORT_SYMBOL(obd_export_nid2str);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid)
+{
+ struct cfs_hash *nid_hash;
+ struct obd_export *doomed_exp = NULL;
+ int exports_evicted = 0;
+
+ lnet_nid_t nid_key = libcfs_str2nid((char *)nid);
+
+ spin_lock(&obd->obd_dev_lock);
+ /* umount has already run, so the evict thread should leave
+ * its task to the umount thread now */
+ if (obd->obd_stopping) {
+ spin_unlock(&obd->obd_dev_lock);
+ return exports_evicted;
+ }
+ nid_hash = obd->obd_nid_hash;
+ cfs_hash_getref(nid_hash);
+ spin_unlock(&obd->obd_dev_lock);
+
+ do {
+ doomed_exp = cfs_hash_lookup(nid_hash, &nid_key);
+ if (doomed_exp == NULL)
+ break;
+
+ LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key,
+ "nid %s found, wanted nid %s, requested nid %s\n",
+ obd_export_nid2str(doomed_exp),
+ libcfs_nid2str(nid_key), nid);
+ LASSERTF(doomed_exp != obd->obd_self_export,
+ "self-export is hashed by NID?\n");
+ exports_evicted++;
+ LCONSOLE_WARN("%s: evicting %s (at %s) by administrative request\n",
+ obd->obd_name,
+ obd_uuid2str(&doomed_exp->exp_client_uuid),
+ obd_export_nid2str(doomed_exp));
+ class_fail_export(doomed_exp);
+ class_export_put(doomed_exp);
+ } while (1);
+
+ cfs_hash_putref(nid_hash);
+
+ if (!exports_evicted)
+ CDEBUG(D_HA,
+ "%s: can't disconnect NID '%s': no exports found\n",
+ obd->obd_name, nid);
+ return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_nid);
+
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
+{
+ struct cfs_hash *uuid_hash;
+ struct obd_export *doomed_exp = NULL;
+ struct obd_uuid doomed_uuid;
+ int exports_evicted = 0;
+
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_stopping) {
+ spin_unlock(&obd->obd_dev_lock);
+ return exports_evicted;
+ }
+ uuid_hash = obd->obd_uuid_hash;
+ cfs_hash_getref(uuid_hash);
+ spin_unlock(&obd->obd_dev_lock);
+
+ obd_str2uuid(&doomed_uuid, uuid);
+ if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+ CERROR("%s: can't evict myself\n", obd->obd_name);
+ cfs_hash_putref(uuid_hash);
+ return exports_evicted;
+ }
+
+ doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid);
+
+ if (doomed_exp == NULL) {
+ CERROR("%s: can't disconnect %s: no exports found\n",
+ obd->obd_name, uuid);
+ } else {
+ CWARN("%s: evicting %s at administrative request\n",
+ obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+ class_fail_export(doomed_exp);
+ class_export_put(doomed_exp);
+ exports_evicted++;
+ }
+ cfs_hash_putref(uuid_hash);
+
+ return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_uuid);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void (*class_export_dump_hook)(struct obd_export*) = NULL;
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+static void print_export_data(struct obd_export *exp, const char *status,
+ int locks)
+{
+ struct ptlrpc_reply_state *rs;
+ struct ptlrpc_reply_state *first_reply = NULL;
+ int nreplies = 0;
+
+ spin_lock(&exp->exp_lock);
+ list_for_each_entry(rs, &exp->exp_outstanding_replies,
+ rs_exp_list) {
+ if (nreplies == 0)
+ first_reply = rs;
+ nreplies++;
+ }
+ spin_unlock(&exp->exp_lock);
+
+ CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s %llu\n",
+ exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+ obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+ atomic_read(&exp->exp_rpc_count),
+ atomic_read(&exp->exp_cb_count),
+ atomic_read(&exp->exp_locks_count),
+ exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+ nreplies, first_reply, nreplies > 3 ? "..." : "",
+ exp->exp_last_committed);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ if (locks && class_export_dump_hook != NULL)
+ class_export_dump_hook(exp);
+#endif
+}
+
+void dump_exports(struct obd_device *obd, int locks)
+{
+ struct obd_export *exp;
+
+ spin_lock(&obd->obd_dev_lock);
+ list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+ print_export_data(exp, "ACTIVE", locks);
+ list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+ print_export_data(exp, "UNLINKED", locks);
+ list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+ print_export_data(exp, "DELAYED", locks);
+ spin_unlock(&obd->obd_dev_lock);
+ spin_lock(&obd_zombie_impexp_lock);
+ list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+ print_export_data(exp, "ZOMBIE", locks);
+ spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+void obd_exports_barrier(struct obd_device *obd)
+{
+ int waited = 2;
+ LASSERT(list_empty(&obd->obd_exports));
+ spin_lock(&obd->obd_dev_lock);
+ while (!list_empty(&obd->obd_unlinked_exports)) {
+ spin_unlock(&obd->obd_dev_lock);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(waited));
+ if (waited > 5 && IS_PO2(waited)) {
+ LCONSOLE_WARN("%s is waiting for obd_unlinked_exports more than %d seconds. The obd refcount = %d. Is it stuck?\n",
+ obd->obd_name, waited,
+ atomic_read(&obd->obd_refcount));
+ dump_exports(obd, 1);
+ }
+ waited *= 2;
+ spin_lock(&obd->obd_dev_lock);
+ }
+ spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/* Total number of zombies to be destroyed */
+static int zombies_count;
+
+/**
+ * kill zombie imports and exports
+ */
+void obd_zombie_impexp_cull(void)
+{
+ struct obd_import *import;
+ struct obd_export *export;
+
+ do {
+ spin_lock(&obd_zombie_impexp_lock);
+
+ import = NULL;
+ if (!list_empty(&obd_zombie_imports)) {
+ import = list_entry(obd_zombie_imports.next,
+ struct obd_import,
+ imp_zombie_chain);
+ list_del_init(&import->imp_zombie_chain);
+ }
+
+ export = NULL;
+ if (!list_empty(&obd_zombie_exports)) {
+ export = list_entry(obd_zombie_exports.next,
+ struct obd_export,
+ exp_obd_chain);
+ list_del_init(&export->exp_obd_chain);
+ }
+
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ if (import != NULL) {
+ class_import_destroy(import);
+ spin_lock(&obd_zombie_impexp_lock);
+ zombies_count--;
+ spin_unlock(&obd_zombie_impexp_lock);
+ }
+
+ if (export != NULL) {
+ class_export_destroy(export);
+ spin_lock(&obd_zombie_impexp_lock);
+ zombies_count--;
+ spin_unlock(&obd_zombie_impexp_lock);
+ }
+
+ cond_resched();
+ } while (import != NULL || export != NULL);
+}
+
+static struct completion obd_zombie_start;
+static struct completion obd_zombie_stop;
+static unsigned long obd_zombie_flags;
+static wait_queue_head_t obd_zombie_waitq;
+static pid_t obd_zombie_pid;
+
+enum {
+ OBD_ZOMBIE_STOP = 0x0001,
+};
+
+/**
+ * check whether there is work for the zombie import/export destroy thread.
+ */
+static int obd_zombie_impexp_check(void *arg)
+{
+ int rc;
+
+ spin_lock(&obd_zombie_impexp_lock);
+ rc = (zombies_count == 0) &&
+ !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ return rc;
+}
+
+/**
+ * Add export to the obd_zombie thread and notify it.
+ */
+static void obd_zombie_export_add(struct obd_export *exp)
+{
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+ LASSERT(!list_empty(&exp->exp_obd_chain));
+ list_del_init(&exp->exp_obd_chain);
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ spin_lock(&obd_zombie_impexp_lock);
+ zombies_count++;
+ list_add(&exp->exp_obd_chain, &obd_zombie_exports);
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ obd_zombie_impexp_notify();
+}
+
+/**
+ * Add import to the obd_zombie thread and notify it.
+ */
+static void obd_zombie_import_add(struct obd_import *imp)
+{
+ LASSERT(imp->imp_sec == NULL);
+ LASSERT(imp->imp_rq_pool == NULL);
+ spin_lock(&obd_zombie_impexp_lock);
+ LASSERT(list_empty(&imp->imp_zombie_chain));
+ zombies_count++;
+ list_add(&imp->imp_zombie_chain, &obd_zombie_imports);
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ obd_zombie_impexp_notify();
+}
+
+/**
+ * notify import/export destroy thread about new zombie.
+ */
+static void obd_zombie_impexp_notify(void)
+{
+ /*
+ * Make sure obd_zombie_impexp_thread gets this notification.
+ * It is possible that the signal is only received by obd_zombie_barrier,
+ * which gulps the notification and goes back to sleep, and a hang ensues.
+ */
+ wake_up_all(&obd_zombie_waitq);
+}
+
+/**
+ * check whether obd_zombie is idle
+ */
+static int obd_zombie_is_idle(void)
+{
+ int rc;
+
+ LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+ spin_lock(&obd_zombie_impexp_lock);
+ rc = (zombies_count == 0);
+ spin_unlock(&obd_zombie_impexp_lock);
+ return rc;
+}
+
+/**
+ * Wait until the obd_zombie import/export queues become empty.
+ */
+void obd_zombie_barrier(void)
+{
+ struct l_wait_info lwi = { 0 };
+
+ if (obd_zombie_pid == current_pid())
+ /* don't wait for myself */
+ return;
+ l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+/**
+ * Thread that destroys zombie exports and imports.
+ */
+static int obd_zombie_impexp_thread(void *unused)
+{
+ unshare_fs_struct();
+ complete(&obd_zombie_start);
+
+ obd_zombie_pid = current_pid();
+
+ while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) {
+ struct l_wait_info lwi = { 0 };
+
+ l_wait_event(obd_zombie_waitq,
+ !obd_zombie_impexp_check(NULL), &lwi);
+ obd_zombie_impexp_cull();
+
+ /*
+ * Notify obd_zombie_barrier callers that queues
+ * may be empty.
+ */
+ wake_up(&obd_zombie_waitq);
+ }
+
+ complete(&obd_zombie_stop);
+
+ return 0;
+}
+
+
+/**
+ * Start the zombie import/export destroy thread.
+ */
+int obd_zombie_impexp_init(void)
+{
+ struct task_struct *task;
+
+ INIT_LIST_HEAD(&obd_zombie_imports);
+ INIT_LIST_HEAD(&obd_zombie_exports);
+ spin_lock_init(&obd_zombie_impexp_lock);
+ init_completion(&obd_zombie_start);
+ init_completion(&obd_zombie_stop);
+ init_waitqueue_head(&obd_zombie_waitq);
+ obd_zombie_pid = 0;
+
+ task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid");
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ wait_for_completion(&obd_zombie_start);
+ return 0;
+}
+
+/**
+ * Stop the zombie import/export destroy thread.
+ */
+void obd_zombie_impexp_stop(void)
+{
+ set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+ obd_zombie_impexp_notify();
+ wait_for_completion(&obd_zombie_stop);
+}
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Get length of entire message, including header */
+int kuc_len(int payload_len)
+{
+ return sizeof(struct kuc_hdr) + payload_len;
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Get a pointer to kuc header, given a ptr to the payload
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header
+ */
+struct kuc_hdr *kuc_ptr(void *p)
+{
+ struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1;
+ LASSERT(lh->kuc_magic == KUC_MAGIC);
+ return lh;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Test if payload is part of kuc message
+ * @param p Pointer to payload area
+ * @returns boolean
+ */
+int kuc_ispayload(void *p)
+{
+ struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1;
+
+ if (kh->kuc_magic == KUC_MAGIC)
+ return 1;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(kuc_ispayload);
+
+/* Alloc space for a message, and fill in header
+ * @return Pointer to payload area
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+ struct kuc_hdr *lh;
+ int len = kuc_len(payload_len);
+
+ OBD_ALLOC(lh, len);
+ if (lh == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ lh->kuc_magic = KUC_MAGIC;
+ lh->kuc_transport = transport;
+ lh->kuc_msgtype = type;
+ lh->kuc_msglen = len;
+
+ return (void *)(lh + 1);
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Takes pointer to payload area */
+inline void kuc_free(void *p, int payload_len)
+{
+ struct kuc_hdr *lh = kuc_ptr(p);
+ OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
new file mode 100644
index 000000000..06944b863
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
@@ -0,0 +1,449 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/lp.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <asm/ioctls.h>
+#include <linux/poll.h>
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/seq_file.h>
+
+#include "../../../include/linux/libcfs/libcfs.h"
+#include "../../../include/linux/lnet/lnetctl.h"
+#include "../../include/obd_support.h"
+#include "../../include/obd_class.h"
+#include "../../include/lprocfs_status.h"
+#include "../../include/lustre_ver.h"
+#include "../../include/lustre/lustre_build_version.h"
+
+int proc_version;
+
+/* buffer MUST be at least the size of obd_ioctl_hdr */
+int obd_ioctl_getdata(char **buf, int *len, void *arg)
+{
+ struct obd_ioctl_hdr hdr;
+ struct obd_ioctl_data *data;
+ int err;
+ int offset = 0;
+
+ if (copy_from_user(&hdr, (void *)arg, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+ CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+ OBD_IOCTL_VERSION, hdr.ioc_version);
+ return -EINVAL;
+ }
+
+ if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+ CERROR("User buffer len %d exceeds %d max buffer\n",
+ hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+ return -EINVAL;
+ }
+
+ if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+ CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+ return -EINVAL;
+ }
+
+ /* When there are lots of processes calling vmalloc on multi-core
+ * system, the high lock contention will hurt performance badly,
+ * obdfilter-survey is an example, which relies on ioctl. So we'd
+ * better avoid vmalloc on ioctl path. LU-66 */
+ OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+ if (*buf == NULL) {
+ CERROR("Cannot allocate control buffer of len %d\n",
+ hdr.ioc_len);
+ return -EINVAL;
+ }
+ *len = hdr.ioc_len;
+ data = (struct obd_ioctl_data *)*buf;
+
+ if (copy_from_user(*buf, (void *)arg, hdr.ioc_len)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+ if (hdr.ioc_len != data->ioc_len) {
+ err = -EINVAL;
+ goto free_buf;
+ }
+
+ if (obd_ioctl_is_invalid(data)) {
+ CERROR("ioctl not correctly formatted\n");
+ err = -EINVAL;
+ goto free_buf;
+ }
+
+ if (data->ioc_inllen1) {
+ data->ioc_inlbuf1 = &data->ioc_bulk[0];
+ offset += cfs_size_round(data->ioc_inllen1);
+ }
+
+ if (data->ioc_inllen2) {
+ data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+ offset += cfs_size_round(data->ioc_inllen2);
+ }
+
+ if (data->ioc_inllen3) {
+ data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+ offset += cfs_size_round(data->ioc_inllen3);
+ }
+
+ if (data->ioc_inllen4) {
+ data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+ }
+
+ return 0;
+
+free_buf:
+ OBD_FREE_LARGE(*buf, hdr.ioc_len);
+ return err;
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
+
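+/* Copy 'len' bytes of result data back to the user-space ioctl argument. */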
+int obd_ioctl_popdata(void *arg, void *data, int len)
+{
+ int err;
+
+ err = copy_to_user(arg, data, len);
+ if (err)
+ err = -EFAULT;
+ return err;
+}
+EXPORT_SYMBOL(obd_ioctl_popdata);
+
+/* opening /dev/obd */
+static int obd_class_open(struct inode *inode, struct file *file)
+{
+ try_module_get(THIS_MODULE);
+ return 0;
+}
+
+/* closing /dev/obd */
+static int obd_class_release(struct inode *inode, struct file *file)
+{
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+/* to control /dev/obd */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ int err = 0;
+
+ /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */
+ if (!capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET))
+ return -EACCES;
+ if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */
+ return -ENOTTY;
+
+ err = class_handle_ioctl(cmd, (unsigned long)arg);
+
+ return err;
+}
+
+/* declare character device */
+static struct file_operations obd_psdev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = obd_class_ioctl,
+ .open = obd_class_open,
+ .release = obd_class_release,
+};
+
+/* modules setup */
+struct miscdevice obd_psdev = {
+ .minor = OBD_DEV_MINOR,
+ .name = OBD_DEV_NAME,
+ .fops = &obd_psdev_fops,
+};
+
+
+#if defined(CONFIG_PROC_FS)
+static int obd_proc_version_seq_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "lustre: %s\nkernel: %s\nbuild: %s\n",
+ LUSTRE_VERSION_STRING, "patchless_client", BUILD_VERSION);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_version);
+
+int obd_proc_pinger_seq_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%s\n", "on");
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_pinger);
+
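+/*
+ * Report overall health: walk all attached, set-up devices, run each
+ * one's health check and flag any that report unhealthy.
+ */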
+static int obd_proc_health_seq_show(struct seq_file *m, void *v)
+{
+ bool healthy = true;
+ int i;
+
+ if (libcfs_catastrophe)
+ seq_printf(m, "LBUG\n");
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd;
+
+ obd = class_num2obd(i);
+ if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
+ continue;
+
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ if (obd->obd_stopping)
+ continue;
+
+ class_incref(obd, __func__, current);
+ read_unlock(&obd_dev_lock);
+
+ if (obd_health_check(NULL, obd)) {
+ seq_printf(m, "device %s reported unhealthy\n",
+ obd->obd_name);
+ healthy = false;
+ }
+ class_decref(obd, __func__, current);
+ read_lock(&obd_dev_lock);
+ }
+ read_unlock(&obd_dev_lock);
+
+ if (healthy)
+ seq_puts(m, "healthy\n");
+ else
+ seq_puts(m, "NOT HEALTHY\n");
+
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_health);
+
+static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%s\n", obd_jobid_var);
+ return 0;
+}
+
+static ssize_t obd_proc_jobid_var_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN)
+ return -EINVAL;
+
+ memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+
+ /* This might leave the var invalid on error, which is probably fine. */
+ if (copy_from_user(obd_jobid_var, buffer, count))
+ return -EFAULT;
+
+ /* Trim the trailing '\n' if any */
+ if (obd_jobid_var[count - 1] == '\n')
+ obd_jobid_var[count - 1] = 0;
+
+ return count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_var);
+
+static int obd_proc_jobid_name_seq_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%s\n", obd_jobid_node);
+ return 0;
+}
+
+static ssize_t obd_proc_jobid_name_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ if (!count || count > JOBSTATS_JOBID_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(obd_jobid_node, buffer, count))
+ return -EFAULT;
+
+ obd_jobid_node[count] = 0;
+
+ /* Trim the trailing '\n' if any */
+ if (obd_jobid_node[count - 1] == '\n')
+ obd_jobid_node[count - 1] = 0;
+
+ return count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_name);
+
+/* Root for /proc/fs/lustre */
+struct proc_dir_entry *proc_lustre_root = NULL;
+EXPORT_SYMBOL(proc_lustre_root);
+
+struct lprocfs_vars lprocfs_base[] = {
+ { "version", &obd_proc_version_fops },
+ { "pinger", &obd_proc_pinger_fops },
+ { "health_check", &obd_proc_health_fops },
+ { "jobid_var", &obd_proc_jobid_var_fops },
+ { .name = "jobid_name",
+ .fops = &obd_proc_jobid_name_fops},
+ { NULL }
+};
+
+static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos)
+{
+ if (*pos >= class_devno_max())
+ return NULL;
+
+ return pos;
+}
+
+static void obd_device_list_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (*pos >= class_devno_max())
+ return NULL;
+
+ return pos;
+}
+
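+/* Print one line per device: index, status, type, name, UUID and refcount. */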
+static int obd_device_list_seq_show(struct seq_file *p, void *v)
+{
+ loff_t index = *(loff_t *)v;
+ struct obd_device *obd = class_num2obd((int)index);
+ char *status;
+
+ if (obd == NULL)
+ return 0;
+
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ if (obd->obd_stopping)
+ status = "ST";
+ else if (obd->obd_inactive)
+ status = "IN";
+ else if (obd->obd_set_up)
+ status = "UP";
+ else if (obd->obd_attached)
+ status = "AT";
+ else
+ status = "--";
+
+ seq_printf(p, "%3d %s %s %s %s %d\n",
+ (int)index, status, obd->obd_type->typ_name,
+ obd->obd_name, obd->obd_uuid.uuid,
+ atomic_read(&obd->obd_refcount));
+ return 0;
+}
+
+struct seq_operations obd_device_list_sops = {
+ .start = obd_device_list_seq_start,
+ .stop = obd_device_list_seq_stop,
+ .next = obd_device_list_seq_next,
+ .show = obd_device_list_seq_show,
+};
+
+static int obd_device_list_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = seq_open(file, &obd_device_list_sops);
+
+ if (rc)
+ return rc;
+
+ seq = file->private_data;
+ seq->private = PDE_DATA(inode);
+
+ return 0;
+}
+
+struct file_operations obd_device_list_fops = {
+ .owner = THIS_MODULE,
+ .open = obd_device_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+int class_procfs_init(void)
+{
+ int rc = 0;
+
+ proc_lustre_root = lprocfs_register("fs/lustre", NULL,
+ lprocfs_base, NULL);
+ if (IS_ERR(proc_lustre_root)) {
+ rc = PTR_ERR(proc_lustre_root);
+ proc_lustre_root = NULL;
+ goto out;
+ }
+
+ rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444,
+ &obd_device_list_fops, NULL);
+out:
+ if (rc)
+ CERROR("error adding /proc/fs/lustre/devices file\n");
+ return 0;
+}
+
+int class_procfs_clean(void)
+{
+ if (proc_lustre_root)
+ lprocfs_remove(&proc_lustre_root);
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
new file mode 100644
index 000000000..62ed706b1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include "../../include/obd_class.h"
+#include "../../include/lustre/lustre_idl.h"
+
+#include <linux/fs.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+
+/* FIXME: just copied from obdo_from_inode */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid)
+{
+ u32 newvalid = 0;
+
+ if (valid & LA_ATIME) {
+ dst->o_atime = la->la_atime;
+ newvalid |= OBD_MD_FLATIME;
+ }
+ if (valid & LA_MTIME) {
+ dst->o_mtime = la->la_mtime;
+ newvalid |= OBD_MD_FLMTIME;
+ }
+ if (valid & LA_CTIME) {
+ dst->o_ctime = la->la_ctime;
+ newvalid |= OBD_MD_FLCTIME;
+ }
+ if (valid & LA_SIZE) {
+ dst->o_size = la->la_size;
+ newvalid |= OBD_MD_FLSIZE;
+ }
+ if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */
+ dst->o_blocks = la->la_blocks;
+ newvalid |= OBD_MD_FLBLOCKS;
+ }
+ if (valid & LA_TYPE) {
+ dst->o_mode = (dst->o_mode & S_IALLUGO) |
+ (la->la_mode & S_IFMT);
+ newvalid |= OBD_MD_FLTYPE;
+ }
+ if (valid & LA_MODE) {
+ dst->o_mode = (dst->o_mode & S_IFMT) |
+ (la->la_mode & S_IALLUGO);
+ newvalid |= OBD_MD_FLMODE;
+ }
+ if (valid & LA_UID) {
+ dst->o_uid = la->la_uid;
+ newvalid |= OBD_MD_FLUID;
+ }
+ if (valid & LA_GID) {
+ dst->o_gid = la->la_gid;
+ newvalid |= OBD_MD_FLGID;
+ }
+ dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_la);
+
+/* FIXME: just copied from obdo_from_inode */
+void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, u32 valid)
+{
+ __u64 newvalid = 0;
+
+ valid &= obdo->o_valid;
+
+ if (valid & OBD_MD_FLATIME) {
+ dst->la_atime = obdo->o_atime;
+ newvalid |= LA_ATIME;
+ }
+ if (valid & OBD_MD_FLMTIME) {
+ dst->la_mtime = obdo->o_mtime;
+ newvalid |= LA_MTIME;
+ }
+ if (valid & OBD_MD_FLCTIME) {
+ dst->la_ctime = obdo->o_ctime;
+ newvalid |= LA_CTIME;
+ }
+ if (valid & OBD_MD_FLSIZE) {
+ dst->la_size = obdo->o_size;
+ newvalid |= LA_SIZE;
+ }
+ if (valid & OBD_MD_FLBLOCKS) {
+ dst->la_blocks = obdo->o_blocks;
+ newvalid |= LA_BLOCKS;
+ }
+ if (valid & OBD_MD_FLTYPE) {
+ dst->la_mode = (dst->la_mode & S_IALLUGO) |
+ (obdo->o_mode & S_IFMT);
+ newvalid |= LA_TYPE;
+ }
+ if (valid & OBD_MD_FLMODE) {
+ dst->la_mode = (dst->la_mode & S_IFMT) |
+ (obdo->o_mode & S_IALLUGO);
+ newvalid |= LA_MODE;
+ }
+ if (valid & OBD_MD_FLUID) {
+ dst->la_uid = obdo->o_uid;
+ newvalid |= LA_UID;
+ }
+ if (valid & OBD_MD_FLGID) {
+ dst->la_gid = obdo->o_gid;
+ newvalid |= LA_GID;
+ }
+ dst->la_valid = newvalid;
+}
+EXPORT_SYMBOL(la_from_obdo);
+
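+/*
+ * Refresh inode attributes from an obdo: timestamps and the block count
+ * only ever move forward, the size is written unconditionally and the
+ * block size is only ever increased (and never below PAGE_CACHE_SHIFT).
+ */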
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid)
+{
+ valid &= src->o_valid;
+
+ if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+ CDEBUG(D_INODE,
+ "valid %#llx, cur time %lu/%lu, new %llu/%llu\n",
+ src->o_valid, LTIME_S(dst->i_mtime),
+ LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+ if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime))
+ LTIME_S(dst->i_atime) = src->o_atime;
+ if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime))
+ LTIME_S(dst->i_mtime) = src->o_mtime;
+ if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+ LTIME_S(dst->i_ctime) = src->o_ctime;
+ if (valid & OBD_MD_FLSIZE)
+ i_size_write(dst, src->o_size);
+ /* optimum IO size */
+ if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits))
+ dst->i_blkbits = ffs(src->o_blksize) - 1;
+
+ if (dst->i_blkbits < PAGE_CACHE_SHIFT)
+ dst->i_blkbits = PAGE_CACHE_SHIFT;
+
+ /* allocation of space */
+ if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
+ /*
+ * XXX: shouldn't overflow be checked here, as in
+ * obdo_to_inode()?
+ */
+ dst->i_blocks = src->o_blocks;
+}
+EXPORT_SYMBOL(obdo_refresh_inode);
+
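+/*
+ * Copy the valid obdo attributes into the inode; unlike
+ * obdo_refresh_inode() most fields are written unconditionally
+ * (only ctime is forward-only).
+ */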
+void obdo_to_inode(struct inode *dst, struct obdo *src, u32 valid)
+{
+ valid &= src->o_valid;
+
+ LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID |
+ OBD_MD_FLID | OBD_MD_FLGROUP)),
+ "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid);
+
+ if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+ CDEBUG(D_INODE,
+ "valid %#llx, cur time %lu/%lu, new %llu/%llu\n",
+ src->o_valid, LTIME_S(dst->i_mtime),
+ LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+ if (valid & OBD_MD_FLATIME)
+ LTIME_S(dst->i_atime) = src->o_atime;
+ if (valid & OBD_MD_FLMTIME)
+ LTIME_S(dst->i_mtime) = src->o_mtime;
+ if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+ LTIME_S(dst->i_ctime) = src->o_ctime;
+ if (valid & OBD_MD_FLSIZE)
+ i_size_write(dst, src->o_size);
+ if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */
+ dst->i_blocks = src->o_blocks;
+ if (dst->i_blocks < src->o_blocks) /* overflow */
+ dst->i_blocks = -1;
+ }
+ if (valid & OBD_MD_FLBLKSZ)
+ dst->i_blkbits = ffs(src->o_blksize) - 1;
+ if (valid & OBD_MD_FLMODE)
+ dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+ if (valid & OBD_MD_FLUID)
+ dst->i_uid = make_kuid(&init_user_ns, src->o_uid);
+ if (valid & OBD_MD_FLGID)
+ dst->i_gid = make_kgid(&init_user_ns, src->o_gid);
+ if (valid & OBD_MD_FLFLAGS)
+ dst->i_flags = src->o_flags;
+}
+EXPORT_SYMBOL(obdo_to_inode);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
new file mode 100644
index 000000000..4b62d2576
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
@@ -0,0 +1,405 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../../include/obd_support.h"
+#include "../../include/lprocfs_status.h"
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *obd_table_header;
+#endif
+
+
+#define OBD_SYSCTL 300
+
+enum {
+ OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */
+ OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */
+ OBD_MEMUSED, /* bytes currently OBD_ALLOCated */
+ OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */
+ OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */
+ OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */
+ OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */
+ OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */
+ OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */
+ OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
+ OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
+ OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */
+ OBD_AT_MIN, /* Adaptive timeouts params */
+ OBD_AT_MAX,
+ OBD_AT_EXTRA,
+ OBD_AT_EARLY_MARGIN,
+ OBD_AT_HISTORY,
+};
+
+
+#ifdef CONFIG_SYSCTL
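+/* Write handler for the timeout knobs; keeps ldlm_timeout below obd_timeout. */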
+static int proc_set_timeout(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ldlm_timeout >= obd_timeout)
+ ldlm_timeout = max(obd_timeout / 3, 1U);
+ return rc;
+}
+
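+/*
+ * The four read-only handlers below report the memory/page allocation
+ * counters as decimal strings and reject writes.
+ */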
+static int proc_memory_alloc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char buf[22];
+ int len;
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write)
+ return -EINVAL;
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", obd_memory_sum());
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ *ppos += *lenp;
+ return 0;
+}
+
+static int proc_pages_alloc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char buf[22];
+ int len;
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write)
+ return -EINVAL;
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", obd_pages_sum());
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ *ppos += *lenp;
+ return 0;
+}
+
+static int proc_mem_max(struct ctl_table *table, int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ char buf[22];
+ int len;
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write)
+ return -EINVAL;
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", obd_memory_max());
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ *ppos += *lenp;
+ return 0;
+}
+
+static int proc_pages_max(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char buf[22];
+ int len;
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write)
+ return -EINVAL;
+
+ len = snprintf(buf, sizeof(buf), "%llu\n", obd_pages_max());
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ *ppos += *lenp;
+ return 0;
+}
+
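+/*
+ * Read/write handler for max_dirty_mb: converts between MB (user view)
+ * and pages (obd_max_dirty_pages), clamping writes to [4MB, 90% of RAM].
+ */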
+static int proc_max_dirty_pages_in_mb(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+
+ if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ rc = lprocfs_write_frac_helper(buffer, *lenp,
+ (unsigned int *)table->data,
+ 1 << (20 - PAGE_CACHE_SHIFT));
+ /* Don't allow dirty pages to exceed 90% of system memory,
+ * and enforce a hard minimum of 4MB. */
+ if (obd_max_dirty_pages > ((totalram_pages / 10) * 9)) {
+ CERROR("Refusing to set max dirty pages to %u, which is more than 90%% of available RAM; setting to %lu\n",
+ obd_max_dirty_pages,
+ ((totalram_pages / 10) * 9));
+ obd_max_dirty_pages = (totalram_pages / 10) * 9;
+ } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) {
+ obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT);
+ }
+ } else {
+ char buf[21];
+ int len;
+
+ len = lprocfs_read_frac_helper(buf, sizeof(buf),
+ *(unsigned int *)table->data,
+ 1 << (20 - PAGE_CACHE_SHIFT));
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ }
+ *ppos += *lenp;
+ return rc;
+}
+
+static int proc_alloc_fail_rate(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+
+ if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ rc = lprocfs_write_frac_helper(buffer, *lenp,
+ (unsigned int *)table->data,
+ OBD_ALLOC_FAIL_MULT);
+ } else {
+ char buf[21];
+ int len;
+
+ len = lprocfs_read_frac_helper(buf, 21,
+ *(unsigned int *)table->data,
+ OBD_ALLOC_FAIL_MULT);
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ }
+ *ppos += *lenp;
+ return rc;
+}
+
+static struct ctl_table obd_table[] = {
+ {
+ .procname = "timeout",
+ .data = &obd_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_set_timeout
+ },
+ {
+ .procname = "debug_peer_on_timeout",
+ .data = &obd_debug_peer_on_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .procname = "dump_on_timeout",
+ .data = &obd_dump_on_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .procname = "dump_on_eviction",
+ .data = &obd_dump_on_eviction,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .procname = "memused",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_memory_alloc
+ },
+ {
+ .procname = "pagesused",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_pages_alloc
+ },
+ {
+ .procname = "memused_max",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_mem_max
+ },
+ {
+ .procname = "pagesused_max",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_pages_max
+ },
+ {
+ .procname = "ldlm_timeout",
+ .data = &ldlm_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_set_timeout
+ },
+ {
+ .procname = "alloc_fail_rate",
+ .data = &obd_alloc_fail_rate,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_alloc_fail_rate
+ },
+ {
+ .procname = "max_dirty_mb",
+ .data = &obd_max_dirty_pages,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_max_dirty_pages_in_mb
+ },
+ {
+ .procname = "at_min",
+ .data = &at_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "at_max",
+ .data = &at_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "at_extra",
+ .data = &at_extra,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "at_early_margin",
+ .data = &at_early_margin,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "at_history",
+ .data = &at_history,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {}
+};
+
+static struct ctl_table parent_table[] = {
+ {
+ .procname = "lustre",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = obd_table
+ },
+ {}
+};
+#endif
+
+void obd_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ if (!obd_table_header)
+ obd_table_header = register_sysctl_table(parent_table);
+#endif
+}
+
+void obd_sysctl_clean(void)
+{
+#ifdef CONFIG_SYSCTL
+ if (obd_table_header)
+ unregister_sysctl_table(obd_table_header);
+ obd_table_header = NULL;
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c
new file mode 100644
index 000000000..114be4a78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog.c
@@ -0,0 +1,1007 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ * if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Mikhail Pershin <tappro@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include "../include/obd_class.h"
+#include "../include/lustre_log.h"
+#include "llog_internal.h"
+
+/*
+ * Allocate a new log or catalog handle
+ * Used inside llog_open().
+ */
+static struct llog_handle *llog_alloc_handle(void)
+{
+ struct llog_handle *loghandle;
+
+ OBD_ALLOC_PTR(loghandle);
+ if (loghandle == NULL)
+ return NULL;
+
+ init_rwsem(&loghandle->lgh_lock);
+ spin_lock_init(&loghandle->lgh_hdr_lock);
+ INIT_LIST_HEAD(&loghandle->u.phd.phd_entry);
+ atomic_set(&loghandle->lgh_refcount, 1);
+
+ return loghandle;
+}
+
+/*
+ * Free llog handle and header data if exists. Used in llog_close() only
+ */
+static void llog_free_handle(struct llog_handle *loghandle)
+{
+ LASSERT(loghandle != NULL);
+
+ /* failed llog_init_handle */
+ if (!loghandle->lgh_hdr)
+ goto out;
+
+ if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)
+ LASSERT(list_empty(&loghandle->u.phd.phd_entry));
+ else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+ LASSERT(list_empty(&loghandle->u.chd.chd_head));
+ LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE);
+ OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE);
+out:
+ OBD_FREE_PTR(loghandle);
+}
+
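+/* Take an extra reference on a llog handle. */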
+void llog_handle_get(struct llog_handle *loghandle)
+{
+ atomic_inc(&loghandle->lgh_refcount);
+}
+
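+/* Drop a reference; the handle is freed when the last one goes away. */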
+void llog_handle_put(struct llog_handle *loghandle)
+{
+ LASSERT(atomic_read(&loghandle->lgh_refcount) > 0);
+ if (atomic_dec_and_test(&loghandle->lgh_refcount))
+ llog_free_handle(loghandle);
+}
+
+/* returns negative on error; 0 if success; 1 if success & log destroyed */
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+ int index)
+{
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ int rc = 0;
+
+ CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n",
+ index, POSTID(&loghandle->lgh_id.lgl_oi));
+
+ if (index == 0) {
+ CERROR("Can't cancel index 0 which is header\n");
+ return -EINVAL;
+ }
+
+ spin_lock(&loghandle->lgh_hdr_lock);
+ if (!ext2_clear_bit(index, llh->llh_bitmap)) {
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index);
+ return -ENOENT;
+ }
+
+ llh->llh_count--;
+
+ if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+ (llh->llh_count == 1) &&
+ (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ rc = llog_destroy(env, loghandle);
+ if (rc < 0) {
+ CERROR("%s: can't destroy empty llog #"DOSTID
+ "#%08x: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, rc);
+ goto out_err;
+ }
+ return 1;
+ }
+ spin_unlock(&loghandle->lgh_hdr_lock);
+
+ rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
+ if (rc < 0) {
+ CERROR("%s: fail to write header for llog #"DOSTID
+ "#%08x: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, rc);
+ goto out_err;
+ }
+ return 0;
+out_err:
+ spin_lock(&loghandle->lgh_hdr_lock);
+ ext2_set_bit(index, llh->llh_bitmap);
+ llh->llh_count++;
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ return rc;
+}
+EXPORT_SYMBOL(llog_cancel_rec);
+
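+/*
+ * Read the llog header from storage; if the log turns out to be empty,
+ * initialize a fresh in-memory header (record 0) instead.
+ */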
+static int llog_read_header(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct obd_uuid *uuid)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_handle2ops(handle, &lop);
+ if (rc)
+ return rc;
+
+ if (lop->lop_read_header == NULL)
+ return -EOPNOTSUPP;
+
+ rc = lop->lop_read_header(env, handle);
+ if (rc == LLOG_EEMPTY) {
+ struct llog_log_hdr *llh = handle->lgh_hdr;
+
+ handle->lgh_last_idx = 0; /* header is record with index 0 */
+ llh->llh_count = 1; /* for the header record */
+ llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC;
+ llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE;
+ llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0;
+ llh->llh_timestamp = get_seconds();
+ if (uuid)
+ memcpy(&llh->llh_tgtuuid, uuid,
+ sizeof(llh->llh_tgtuuid));
+ llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap);
+ ext2_set_bit(0, llh->llh_bitmap);
+ rc = 0;
+ }
+ return rc;
+}
+
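+/*
+ * Allocate and read (or initialize) the header for an opened llog handle,
+ * checking that the on-disk flags match the requested plain/catalog type.
+ */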
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+ int flags, struct obd_uuid *uuid)
+{
+ struct llog_log_hdr *llh;
+ int rc;
+
+ LASSERT(handle->lgh_hdr == NULL);
+
+ OBD_ALLOC_PTR(llh);
+ if (llh == NULL)
+ return -ENOMEM;
+ handle->lgh_hdr = llh;
+ /* first assign flags to use llog_client_ops */
+ llh->llh_flags = flags;
+ rc = llog_read_header(env, handle, uuid);
+ if (rc == 0) {
+ if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN &&
+ flags & LLOG_F_IS_CAT) ||
+ (llh->llh_flags & LLOG_F_IS_CAT &&
+ flags & LLOG_F_IS_PLAIN))) {
+ CERROR("%s: llog type is %s but initializing %s\n",
+ handle->lgh_ctxt->loc_obd->obd_name,
+ llh->llh_flags & LLOG_F_IS_CAT ?
+ "catalog" : "plain",
+ flags & LLOG_F_IS_CAT ? "catalog" : "plain");
+ rc = -EINVAL;
+ goto out;
+ } else if (llh->llh_flags &
+ (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) {
+ /*
+ * it is possible to open llog without specifying llog
+ * type so it is taken from llh_flags
+ */
+ flags = llh->llh_flags;
+ } else {
+ /* for some reason the llh_flags has no type set */
+ CERROR("llog type is not specified!\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ if (unlikely(uuid &&
+ !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
+ CERROR("%s: llog uuid mismatch: %s/%s\n",
+ handle->lgh_ctxt->loc_obd->obd_name,
+ (char *)uuid->uuid,
+ (char *)llh->llh_tgtuuid.uuid);
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+ if (flags & LLOG_F_IS_CAT) {
+ LASSERT(list_empty(&handle->u.chd.chd_head));
+ INIT_LIST_HEAD(&handle->u.chd.chd_head);
+ llh->llh_size = sizeof(struct llog_logid_rec);
+ } else if (!(flags & LLOG_F_IS_PLAIN)) {
+ CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
+ handle->lgh_ctxt->loc_obd->obd_name,
+ flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+ rc = -EINVAL;
+ }
+out:
+ if (rc) {
+ OBD_FREE_PTR(llh);
+ handle->lgh_hdr = NULL;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(llog_init_handle);
+
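+/*
+ * Walk a plain llog chunk by chunk and invoke the callback on every
+ * record whose index is set in the header bitmap.
+ */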
+static int llog_process_thread(void *arg)
+{
+ struct llog_process_info *lpi = arg;
+ struct llog_handle *loghandle = lpi->lpi_loghandle;
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ struct llog_process_cat_data *cd = lpi->lpi_catdata;
+ char *buf;
+ __u64 cur_offset = LLOG_CHUNK_SIZE;
+ __u64 last_offset;
+ int rc = 0, index = 1, last_index;
+ int saved_index = 0;
+ int last_called_index = 0;
+
+ LASSERT(llh);
+
+ OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+ if (!buf) {
+ lpi->lpi_rc = -ENOMEM;
+ return 0;
+ }
+
+ if (cd != NULL) {
+ last_called_index = cd->lpcd_first_idx;
+ index = cd->lpcd_first_idx + 1;
+ }
+ if (cd != NULL && cd->lpcd_last_idx)
+ last_index = cd->lpcd_last_idx;
+ else
+ last_index = LLOG_BITMAP_BYTES * 8 - 1;
+
+ while (rc == 0) {
+ struct llog_rec_hdr *rec;
+
+ /* skip records not set in bitmap */
+ while (index <= last_index &&
+ !ext2_test_bit(index, llh->llh_bitmap))
+ ++index;
+
+ LASSERT(index <= last_index + 1);
+ if (index == last_index + 1)
+ break;
+repeat:
+ CDEBUG(D_OTHER, "index: %d last_index %d\n",
+ index, last_index);
+
+ /* get the buf with our target record; avoid old garbage */
+ memset(buf, 0, LLOG_CHUNK_SIZE);
+ last_offset = cur_offset;
+ rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index,
+ index, &cur_offset, buf, LLOG_CHUNK_SIZE);
+ if (rc)
+ goto out;
+
+ /* NB: when rec->lrh_len is accessed it is already swabbed
+ * since it is used at the "end" of the loop and the rec
+ * swabbing is done at the beginning of the loop. */
+ for (rec = (struct llog_rec_hdr *)buf;
+ (char *)rec < buf + LLOG_CHUNK_SIZE;
+ rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)) {
+
+ CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+ rec, rec->lrh_type);
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+
+ CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n",
+ rec->lrh_type, rec->lrh_index);
+
+ if (rec->lrh_index == 0) {
+ /* probably another rec just got added? */
+ rc = 0;
+ if (index <= loghandle->lgh_last_idx)
+ goto repeat;
+ goto out; /* no more records */
+ }
+ if (rec->lrh_len == 0 ||
+ rec->lrh_len > LLOG_CHUNK_SIZE) {
+ CWARN("invalid length %d in llog record for index %d/%d\n",
+ rec->lrh_len,
+ rec->lrh_index, index);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (rec->lrh_index < index) {
+ CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+ rec->lrh_index);
+ continue;
+ }
+
+ CDEBUG(D_OTHER,
+ "lrh_index: %d lrh_len: %d (%d remains)\n",
+ rec->lrh_index, rec->lrh_len,
+ (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
+ loghandle->lgh_cur_idx = rec->lrh_index;
+ loghandle->lgh_cur_offset = (char *)rec - (char *)buf +
+ last_offset;
+
+ /* if set, process the callback on this record */
+ if (ext2_test_bit(index, llh->llh_bitmap)) {
+ rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec,
+ lpi->lpi_cbdata);
+ last_called_index = index;
+ if (rc == LLOG_PROC_BREAK) {
+ goto out;
+ } else if (rc == LLOG_DEL_RECORD) {
+ llog_cancel_rec(lpi->lpi_env,
+ loghandle,
+ rec->lrh_index);
+ rc = 0;
+ }
+ if (rc)
+ goto out;
+ } else {
+ CDEBUG(D_OTHER, "Skipped index %d\n", index);
+ }
+
+ /* next record, still in buffer? */
+ ++index;
+ if (index > last_index) {
+ rc = 0;
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (cd != NULL)
+ cd->lpcd_last_idx = last_called_index;
+
+ OBD_FREE(buf, LLOG_CHUNK_SIZE);
+ lpi->lpi_rc = rc;
+ return 0;
+}
+
+static int llog_process_thread_daemonize(void *arg)
+{
+ struct llog_process_info *lpi = arg;
+ struct lu_env env;
+ int rc;
+
+ unshare_fs_struct();
+
+ /* client env has no keys, tags is just 0 */
+ rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+ if (rc)
+ goto out;
+ lpi->lpi_env = &env;
+
+ rc = llog_process_thread(arg);
+
+ lu_env_fini(&env);
+out:
+ complete(&lpi->lpi_completion);
+ return rc;
+}
+
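+/*
+ * Process a llog either in the caller's context or, if 'fork' is set,
+ * in a separate kernel thread with its own lu_env.
+ */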
+int llog_process_or_fork(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata, bool fork)
+{
+ struct llog_process_info *lpi;
+ int rc;
+
+ OBD_ALLOC_PTR(lpi);
+ if (lpi == NULL) {
+ CERROR("cannot alloc pointer\n");
+ return -ENOMEM;
+ }
+ lpi->lpi_loghandle = loghandle;
+ lpi->lpi_cb = cb;
+ lpi->lpi_cbdata = data;
+ lpi->lpi_catdata = catdata;
+
+ if (fork) {
+ /* The new thread can't use parent env,
+ * init the new one in llog_process_thread_daemonize. */
+ lpi->lpi_env = NULL;
+ init_completion(&lpi->lpi_completion);
+ rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi,
+ "llog_process_thread"));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("%s: cannot start thread: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+ OBD_FREE_PTR(lpi);
+ return rc;
+ }
+ wait_for_completion(&lpi->lpi_completion);
+ } else {
+ lpi->lpi_env = env;
+ llog_process_thread(lpi);
+ }
+ rc = lpi->lpi_rc;
+ OBD_FREE_PTR(lpi);
+ return rc;
+}
+EXPORT_SYMBOL(llog_process_or_fork);
+
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata)
+{
+ return llog_process_or_fork(env, loghandle, cb, data, catdata, true);
+}
+EXPORT_SYMBOL(llog_process);
+
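+/* Like llog_process(), but walks the records from the last index backwards. */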
+int llog_reverse_process(const struct lu_env *env,
+ struct llog_handle *loghandle, llog_cb_t cb,
+ void *data, void *catdata)
+{
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ struct llog_process_cat_data *cd = catdata;
+ void *buf;
+ int rc = 0, first_index = 1, index, idx;
+
+ OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+ if (!buf)
+ return -ENOMEM;
+
+ if (cd != NULL)
+ first_index = cd->lpcd_first_idx + 1;
+ if (cd != NULL && cd->lpcd_last_idx)
+ index = cd->lpcd_last_idx;
+ else
+ index = LLOG_BITMAP_BYTES * 8 - 1;
+
+ while (rc == 0) {
+ struct llog_rec_hdr *rec;
+ struct llog_rec_tail *tail;
+
+ /* skip records not set in bitmap */
+ while (index >= first_index &&
+ !ext2_test_bit(index, llh->llh_bitmap))
+ --index;
+
+ LASSERT(index >= first_index - 1);
+ if (index == first_index - 1)
+ break;
+
+ /* get the buf with our target record; avoid old garbage */
+ memset(buf, 0, LLOG_CHUNK_SIZE);
+ rc = llog_prev_block(env, loghandle, index, buf,
+ LLOG_CHUNK_SIZE);
+ if (rc)
+ goto out;
+
+ rec = buf;
+ idx = rec->lrh_index;
+ CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx);
+ while (idx < index) {
+ rec = (void *)rec + rec->lrh_len;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+ idx++;
+ }
+ LASSERT(idx == index);
+ tail = (void *)rec + rec->lrh_len - sizeof(*tail);
+
+ /* process records in buffer, starting where we found one */
+ while ((void *)tail > buf) {
+ if (tail->lrt_index == 0) {
+ /* no more records */
+ rc = 0;
+ goto out;
+ }
+
+ /* if set, process the callback on this record */
+ if (ext2_test_bit(index, llh->llh_bitmap)) {
+ rec = (void *)tail - tail->lrt_len +
+ sizeof(*tail);
+
+ rc = cb(env, loghandle, rec, data);
+ if (rc == LLOG_PROC_BREAK) {
+ goto out;
+ } else if (rc == LLOG_DEL_RECORD) {
+ llog_cancel_rec(env, loghandle,
+ tail->lrt_index);
+ rc = 0;
+ }
+ if (rc)
+ goto out;
+ }
+
+ /* previous record, still in buffer? */
+ --index;
+ if (index < first_index) {
+ rc = 0;
+ goto out;
+ }
+ tail = (void *)tail - tail->lrt_len;
+ }
+ }
+
+out:
+ if (buf)
+ OBD_FREE(buf, LLOG_CHUNK_SIZE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_reverse_process);
+
+/**
+ * new llog API
+ *
+ * API functions:
+ * llog_open - open llog, may not exist
+ * llog_exist - check if llog exists
+ * llog_close - close opened llog, pair for open, frees llog_handle
+ * llog_declare_create - declare llog creation
+ * llog_create - create new llog on disk, need transaction handle
+ * llog_declare_write_rec - declaration of llog write
+ * llog_write_rec - write llog record on disk, need transaction handle
+ * llog_declare_add - declare llog catalog record addition
+ * llog_add - add llog record in catalog, need transaction handle
+ */
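+/*
+ * A typical call sequence for a plain llog is sketched below (error
+ * handling omitted; 'rec' is assumed to point at a valid llog_rec_hdr):
+ *
+ *   llog_open_create(env, ctxt, &lgh, NULL, name);
+ *   llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, NULL);
+ *   llog_write(env, lgh, rec, NULL, 0, NULL, -1);
+ *   llog_close(env, lgh);
+ */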
+int llog_exist(struct llog_handle *loghandle)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_exist == NULL)
+ return -EOPNOTSUPP;
+
+ rc = lop->lop_exist(loghandle);
+ return rc;
+}
+EXPORT_SYMBOL(llog_exist);
+
+int llog_declare_create(const struct lu_env *env,
+ struct llog_handle *loghandle, struct thandle *th)
+{
+ struct llog_operations *lop;
+ int raised, rc;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_declare_create == NULL)
+ return -EOPNOTSUPP;
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = lop->lop_declare_create(env, loghandle, th);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_declare_create);
+
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+ struct thandle *th)
+{
+ struct llog_operations *lop;
+ int raised, rc;
+
+ rc = llog_handle2ops(handle, &lop);
+ if (rc)
+ return rc;
+ if (lop->lop_create == NULL)
+ return -EOPNOTSUPP;
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = lop->lop_create(env, handle, th);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_create);
+
+int llog_declare_write_rec(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, int idx,
+ struct thandle *th)
+{
+ struct llog_operations *lop;
+ int raised, rc;
+
+ rc = llog_handle2ops(handle, &lop);
+ if (rc)
+ return rc;
+ LASSERT(lop);
+ if (lop->lop_declare_write_rec == NULL)
+ return -EOPNOTSUPP;
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = lop->lop_declare_write_rec(env, handle, rec, idx, th);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_declare_write_rec);
+
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+ struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+ int numcookies, void *buf, int idx, struct thandle *th)
+{
+ struct llog_operations *lop;
+ int raised, rc, buflen;
+
+ rc = llog_handle2ops(handle, &lop);
+ if (rc)
+ return rc;
+
+ LASSERT(lop);
+ if (lop->lop_write_rec == NULL)
+ return -EOPNOTSUPP;
+
+ if (buf)
+ buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) +
+ sizeof(struct llog_rec_tail);
+ else
+ buflen = rec->lrh_len;
+ LASSERT(cfs_size_round(buflen) == buflen);
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies,
+ buf, idx, th);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_write_rec);
+
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+ void *buf, struct thandle *th)
+{
+ int raised, rc;
+
+ if (lgh->lgh_logops->lop_add == NULL)
+ return -EOPNOTSUPP;
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_add);
+
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+ struct llog_rec_hdr *rec, struct thandle *th)
+{
+ int raised, rc;
+
+ if (lgh->lgh_logops->lop_declare_add == NULL)
+ return -EOPNOTSUPP;
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ return rc;
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open a llog, creating it if it does not exist.
+ * It hides all transaction handling from the caller.
+ */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_handle **res, struct llog_logid *logid,
+ char *name)
+{
+ struct dt_device *d;
+ struct thandle *th;
+ int rc;
+
+ rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+ if (rc)
+ return rc;
+
+ if (llog_exist(*res))
+ return 0;
+
+ LASSERT((*res)->lgh_obj != NULL);
+
+ d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+
+ th = dt_trans_create(env, d);
+ if (IS_ERR(th)) {
+ rc = PTR_ERR(th);
+ goto out;
+ }
+
+ rc = llog_declare_create(env, *res, th);
+ if (rc == 0) {
+ rc = dt_trans_start_local(env, d, th);
+ if (rc == 0)
+ rc = llog_create(env, *res, th);
+ }
+ dt_trans_stop(env, d, th);
+out:
+ if (rc)
+ llog_close(env, *res);
+ return rc;
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete an existing llog.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_logid *logid, char *name)
+{
+ struct llog_handle *handle;
+ int rc = 0, rc2;
+
+ /* nothing to erase */
+ if (name == NULL && logid == NULL)
+ return 0;
+
+ rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS);
+ if (rc < 0)
+ return rc;
+
+ rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL);
+ if (rc == 0)
+ rc = llog_destroy(env, handle);
+
+ rc2 = llog_close(env, handle);
+ if (rc == 0)
+ rc = rc2;
+ return rc;
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function to write a record to a llog.
+ * It hides all transaction handling from the caller.
+ * Valid only for a local llog.
+ */
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ int cookiecount, void *buf, int idx)
+{
+ struct dt_device *dt;
+ struct thandle *th;
+ int rc;
+
+ LASSERT(loghandle);
+ LASSERT(loghandle->lgh_ctxt);
+ LASSERT(loghandle->lgh_obj != NULL);
+
+ dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+ th = dt_trans_create(env, dt);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ rc = llog_declare_write_rec(env, loghandle, rec, idx, th);
+ if (rc)
+ goto out_trans;
+
+ rc = dt_trans_start_local(env, dt, th);
+ if (rc)
+ goto out_trans;
+
+ down_write(&loghandle->lgh_lock);
+ rc = llog_write_rec(env, loghandle, rec, reccookie,
+ cookiecount, buf, idx, th);
+ up_write(&loghandle->lgh_lock);
+out_trans:
+ dt_trans_stop(env, dt, th);
+ return rc;
+}
+EXPORT_SYMBOL(llog_write);
+
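+/*
+ * Allocate a handle and open a llog by logid or name; on failure the
+ * handle is freed and *lgh is set to NULL.
+ */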
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_handle **lgh, struct llog_logid *logid,
+ char *name, enum llog_open_param open_param)
+{
+ int raised;
+ int rc;
+
+ LASSERT(ctxt);
+ LASSERT(ctxt->loc_logops);
+
+ if (ctxt->loc_logops->lop_open == NULL) {
+ *lgh = NULL;
+ return -EOPNOTSUPP;
+ }
+
+ *lgh = llog_alloc_handle();
+ if (*lgh == NULL)
+ return -ENOMEM;
+ (*lgh)->lgh_ctxt = ctxt;
+ (*lgh)->lgh_logops = ctxt->loc_logops;
+
+ raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+ if (!raised)
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+ rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param);
+ if (!raised)
+ cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+ if (rc) {
+ llog_free_handle(*lgh);
+ *lgh = NULL;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(llog_open);
+
+int llog_close(const struct lu_env *env, struct llog_handle *loghandle)
+{
+ struct llog_operations *lop;
+ int rc;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ goto out;
+ if (lop->lop_close == NULL) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+ rc = lop->lop_close(env, loghandle);
+out:
+ llog_handle_put(loghandle);
+ return rc;
+}
+EXPORT_SYMBOL(llog_close);
+
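+/* Return 1 if the named llog does not exist or holds only its header record. */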
+int llog_is_empty(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name)
+{
+ struct llog_handle *llh;
+ int rc = 0;
+
+ rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+ if (rc < 0) {
+ if (likely(rc == -ENOENT))
+ rc = 0;
+ goto out;
+ }
+
+ rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ goto out_close;
+ rc = llog_get_size(llh);
+
+out_close:
+ llog_close(env, llh);
+out:
+ /* header is record 1 */
+ return rc <= 1;
+}
+EXPORT_SYMBOL(llog_is_empty);
+
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct llog_handle *copy_llh = data;
+
+ /* Append all records */
+ return llog_write(env, copy_llh, rec, NULL, 0, NULL, -1);
+}
+EXPORT_SYMBOL(llog_copy_handler);
+
+/* backup plain llog */
+int llog_backup(const struct lu_env *env, struct obd_device *obd,
+ struct llog_ctxt *ctxt, struct llog_ctxt *bctxt,
+ char *name, char *backup)
+{
+ struct llog_handle *llh, *bllh;
+ int rc;
+
+ /* open original log */
+ rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+ if (rc < 0) {
+ /* The -ENOENT case is also reported to the caller, but
+ * silently, so the caller should handle it if needed.
+ */
+ if (rc != -ENOENT)
+ CERROR("%s: failed to open log %s: rc = %d\n",
+ obd->obd_name, name, rc);
+ return rc;
+ }
+
+ rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ goto out_close;
+
+ /* Make sure there's no old backup log */
+ rc = llog_erase(env, bctxt, NULL, backup);
+ if (rc < 0 && rc != -ENOENT)
+ goto out_close;
+
+ /* open backup log */
+ rc = llog_open_create(env, bctxt, &bllh, NULL, backup);
+ if (rc) {
+ CERROR("%s: failed to open backup logfile %s: rc = %d\n",
+ obd->obd_name, backup, rc);
+ goto out_close;
+ }
+
+ /* check that backup llog is not the same object as original one */
+ if (llh->lgh_obj == bllh->lgh_obj) {
+ CERROR("%s: backup llog %s to itself (%s), objects %p/%p\n",
+ obd->obd_name, name, backup, llh->lgh_obj,
+ bllh->lgh_obj);
+ rc = -EEXIST;
+ goto out_backup;
+ }
+
+ rc = llog_init_handle(env, bllh, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ goto out_backup;
+
+ /* Copy log record by record */
+ rc = llog_process_or_fork(env, llh, llog_copy_handler, (void *)bllh,
+ NULL, false);
+ if (rc)
+ CERROR("%s: failed to backup log %s: rc = %d\n",
+ obd->obd_name, name, rc);
+out_backup:
+ llog_close(env, bllh);
+out_close:
+ llog_close(env, llh);
+ return rc;
+}
+EXPORT_SYMBOL(llog_backup);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
new file mode 100644
index 000000000..c8f6ab006
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
@@ -0,0 +1,815 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ * if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include "../include/obd_class.h"
+
+#include "llog_internal.h"
+
+/* Create a new log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ */
+static int llog_cat_new_log(const struct lu_env *env,
+ struct llog_handle *cathandle,
+ struct llog_handle *loghandle,
+ struct thandle *th)
+{
+ struct llog_log_hdr *llh;
+ struct llog_logid_rec rec = { { 0 }, };
+ int rc, index, bitmap_size;
+
+ llh = cathandle->lgh_hdr;
+ bitmap_size = LLOG_BITMAP_SIZE(llh);
+
+ index = (cathandle->lgh_last_idx + 1) % bitmap_size;
+
+ /* maximum number of available slots in catlog is bitmap_size - 2 */
+ if (llh->llh_cat_idx == index) {
+ CERROR("no free catalog slots for log...\n");
+ return -ENOSPC;
+ }
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
+ return -ENOSPC;
+
+ rc = llog_create(env, loghandle, th);
+ /* if llog is already created, no need to initialize it */
+ if (rc == -EEXIST) {
+ return 0;
+ } else if (rc != 0) {
+ CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+ return rc;
+ }
+
+ rc = llog_init_handle(env, loghandle,
+ LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+ &cathandle->lgh_hdr->llh_tgtuuid);
+ if (rc)
+ goto out_destroy;
+
+ if (index == 0)
+ index = 1;
+
+ spin_lock(&loghandle->lgh_hdr_lock);
+ llh->llh_count++;
+ if (ext2_set_bit(index, llh->llh_bitmap)) {
+ CERROR("argh, index %u already set in log bitmap?\n",
+ index);
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ LBUG(); /* should never happen */
+ }
+ spin_unlock(&loghandle->lgh_hdr_lock);
+
+ cathandle->lgh_last_idx = index;
+ llh->llh_tail.lrt_index = index;
+
+ CDEBUG(D_RPCTRACE,
+ "new recovery log "DOSTID":%x for index %u of catalog"
+ DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, index,
+ POSTID(&cathandle->lgh_id.lgl_oi));
+ /* build the record for this log in the catalog */
+ rec.lid_hdr.lrh_len = sizeof(rec);
+ rec.lid_hdr.lrh_index = index;
+ rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+ rec.lid_id = loghandle->lgh_id;
+ rec.lid_tail.lrt_len = sizeof(rec);
+ rec.lid_tail.lrt_index = index;
+
+ /* update the catalog: header and record */
+ rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+ &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
+ if (rc < 0)
+ goto out_destroy;
+
+ loghandle->lgh_hdr->llh_cat_idx = index;
+ return 0;
+out_destroy:
+ llog_destroy(env, loghandle);
+ return rc;
+}
+
+/* Open an existing log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ * We return a lock on the handle to ensure nobody yanks it from us.
+ *
+ * This takes an extra reference on the llog_handle via llog_handle_get()
+ * and requires the caller to drop that reference with llog_handle_put().
+ */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle **res, struct llog_logid *logid)
+{
+ struct llog_handle *loghandle;
+ int rc = 0;
+
+ if (cathandle == NULL)
+ return -EBADF;
+
+ down_write(&cathandle->lgh_lock);
+ list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+ u.phd.phd_entry) {
+ struct llog_logid *cgl = &loghandle->lgh_id;
+
+ if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) &&
+ ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
+ if (cgl->lgl_ogen != logid->lgl_ogen) {
+ CERROR("%s: log "DOSTID" generation %x != %x\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&logid->lgl_oi), cgl->lgl_ogen,
+ logid->lgl_ogen);
+ continue;
+ }
+ loghandle->u.phd.phd_cat_handle = cathandle;
+ up_write(&cathandle->lgh_lock);
+ rc = 0;
+ goto out;
+ }
+ }
+ up_write(&cathandle->lgh_lock);
+
+ rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL,
+ LLOG_OPEN_EXISTS);
+ if (rc < 0) {
+ CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n",
+ cathandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+ return rc;
+ }
+
+ rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL);
+ if (rc < 0) {
+ llog_close(env, loghandle);
+ loghandle = NULL;
+ return rc;
+ }
+
+ down_write(&cathandle->lgh_lock);
+ list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
+ up_write(&cathandle->lgh_lock);
+
+ loghandle->u.phd.phd_cat_handle = cathandle;
+ loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
+ loghandle->u.phd.phd_cookie.lgc_index =
+ loghandle->lgh_hdr->llh_cat_idx;
+out:
+ llog_handle_get(loghandle);
+ *res = loghandle;
+ return 0;
+}
+
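+/* Close a catalog llog: walk the list of open plain llogs, destroy any that
+ * are flagged LLOG_F_ZAP_WHEN_EMPTY and contain only their header record
+ * (removing their catalog entries as well), close each of them, and finally
+ * close the catalog handle itself.
+ */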
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle)
+{
+ struct llog_handle *loghandle, *n;
+ int rc;
+
+ list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head,
+ u.phd.phd_entry) {
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ int index;
+
+ /* unlink open-not-created llogs */
+ list_del_init(&loghandle->u.phd.phd_entry);
+ if (loghandle->lgh_obj != NULL && llh != NULL &&
+ (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+ (llh->llh_count == 1)) {
+ rc = llog_destroy(env, loghandle);
+ if (rc)
+ CERROR("%s: failure destroying log during cleanup: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ rc);
+
+ index = loghandle->u.phd.phd_cookie.lgc_index;
+ llog_cat_cleanup(env, cathandle, NULL, index);
+ }
+ llog_close(env, loghandle);
+ }
+ /* if handle was stored in ctxt, remove it too */
+ if (cathandle->lgh_ctxt->loc_handle == cathandle)
+ cathandle->lgh_ctxt->loc_handle = NULL;
+ rc = llog_close(env, cathandle);
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_close);
+
+/**
+ * lockdep markers for nested struct llog_handle::lgh_lock locking.
+ */
+enum {
+ LLOGH_CAT,
+ LLOGH_LOG
+};
+
+/** Return the currently active log handle. If the current log handle doesn't
+ * have enough space left for the current record, start a new one.
+ *
+ * If reclen is 0, we only want to know what the currently active log is,
+ * otherwise we get a lock on this log so nobody can steal our space.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * NOTE: loghandle is write-locked upon successful return
+ */
+static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
+ struct thandle *th)
+{
+ struct llog_handle *loghandle = NULL;
+
+ down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+ loghandle = cathandle->u.chd.chd_current_log;
+ if (loghandle) {
+ struct llog_log_hdr *llh;
+
+ down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+ llh = loghandle->lgh_hdr;
+ if (llh == NULL ||
+ loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+ up_read(&cathandle->lgh_lock);
+ return loghandle;
+ } else {
+ up_write(&loghandle->lgh_lock);
+ }
+ }
+ up_read(&cathandle->lgh_lock);
+
+ /* time to use next log */
+
+ /* first, we have to make sure the state hasn't changed */
+ down_write_nested(&cathandle->lgh_lock, LLOGH_CAT);
+ loghandle = cathandle->u.chd.chd_current_log;
+ if (loghandle) {
+ struct llog_log_hdr *llh;
+
+ down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+ llh = loghandle->lgh_hdr;
+ LASSERT(llh);
+ if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+ up_write(&cathandle->lgh_lock);
+ return loghandle;
+ } else {
+ up_write(&loghandle->lgh_lock);
+ }
+ }
+
+ CDEBUG(D_INODE, "use next log\n");
+
+ loghandle = cathandle->u.chd.chd_next_log;
+ cathandle->u.chd.chd_current_log = loghandle;
+ cathandle->u.chd.chd_next_log = NULL;
+ down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+ up_write(&cathandle->lgh_lock);
+ LASSERT(loghandle);
+ return loghandle;
+}
+
+/* Add a single record to the recovery log(s) using a catalog.
+ * Returns the same values as llog_write_rec().
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf, struct thandle *th)
+{
+ struct llog_handle *loghandle;
+ int rc;
+
+ LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
+ loghandle = llog_cat_current_log(cathandle, th);
+ LASSERT(!IS_ERR(loghandle));
+
+ /* loghandle is already locked by llog_cat_current_log() for us */
+ if (!llog_exist(loghandle)) {
+ rc = llog_cat_new_log(env, cathandle, loghandle, th);
+ if (rc < 0) {
+ up_write(&loghandle->lgh_lock);
+ return rc;
+ }
+ }
+ /* now let's try to add the record */
+ rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th);
+ if (rc < 0)
+ CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR,
+ "llog_write_rec %d: lh=%p\n", rc, loghandle);
+ up_write(&loghandle->lgh_lock);
+ if (rc == -ENOSPC) {
+ /* try to use next log */
+ loghandle = llog_cat_current_log(cathandle, th);
+ LASSERT(!IS_ERR(loghandle));
+ /* new llog can be created concurrently */
+ if (!llog_exist(loghandle)) {
+ rc = llog_cat_new_log(env, cathandle, loghandle, th);
+ if (rc < 0) {
+ up_write(&loghandle->lgh_lock);
+ return rc;
+ }
+ }
+ /* now let's try to add the record */
+ rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf,
+ -1, th);
+ if (rc < 0)
+ CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle);
+ up_write(&loghandle->lgh_lock);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
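+/* Declare the intent to add a record through the catalog: open (and, if
+ * necessary, declare creation of) the current and next plain llogs and
+ * declare the record write against them so the transaction reserves
+ * enough credits.
+ */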
+int llog_cat_declare_add_rec(const struct lu_env *env,
+ struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct thandle *th)
+{
+ struct llog_handle *loghandle, *next;
+ int rc = 0;
+
+ if (cathandle->u.chd.chd_current_log == NULL) {
+ /* declare new plain llog */
+ down_write(&cathandle->lgh_lock);
+ if (cathandle->u.chd.chd_current_log == NULL) {
+ rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+ NULL, NULL, LLOG_OPEN_NEW);
+ if (rc == 0) {
+ cathandle->u.chd.chd_current_log = loghandle;
+ list_add_tail(&loghandle->u.phd.phd_entry,
+ &cathandle->u.chd.chd_head);
+ }
+ }
+ up_write(&cathandle->lgh_lock);
+ } else if (cathandle->u.chd.chd_next_log == NULL) {
+ /* declare next plain llog */
+ down_write(&cathandle->lgh_lock);
+ if (cathandle->u.chd.chd_next_log == NULL) {
+ rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+ NULL, NULL, LLOG_OPEN_NEW);
+ if (rc == 0) {
+ cathandle->u.chd.chd_next_log = loghandle;
+ list_add_tail(&loghandle->u.phd.phd_entry,
+ &cathandle->u.chd.chd_head);
+ }
+ }
+ up_write(&cathandle->lgh_lock);
+ }
+ if (rc)
+ goto out;
+
+ if (!llog_exist(cathandle->u.chd.chd_current_log)) {
+ rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
+ th);
+ if (rc)
+ goto out;
+ llog_declare_write_rec(env, cathandle, NULL, -1, th);
+ }
+ /* declare records in the llogs */
+ rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+ rec, -1, th);
+ if (rc)
+ goto out;
+
+ next = cathandle->u.chd.chd_next_log;
+ if (next) {
+ if (!llog_exist(next)) {
+ rc = llog_declare_create(env, next, th);
+ llog_declare_write_rec(env, cathandle, NULL, -1, th);
+ }
+ llog_declare_write_rec(env, next, rec, -1, th);
+ }
+out:
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
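+/* Add a record through the catalog. When the catalog is backed by a dt
+ * object, a local transaction is created, the additions are declared and
+ * the record is written inside it; otherwise the lvfs compatibility path
+ * is used without a transaction.
+ */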
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf)
+{
+ struct llog_ctxt *ctxt;
+ struct dt_device *dt;
+ struct thandle *th = NULL;
+ int rc;
+
+ ctxt = cathandle->lgh_ctxt;
+ LASSERT(ctxt);
+ LASSERT(ctxt->loc_exp);
+
+ if (cathandle->lgh_obj != NULL) {
+ dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+ LASSERT(dt);
+
+ th = dt_trans_create(env, dt);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+ if (rc)
+ goto out_trans;
+
+ rc = dt_trans_start_local(env, dt, th);
+ if (rc)
+ goto out_trans;
+ rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
+out_trans:
+ dt_trans_stop(env, dt, th);
+ } else { /* lvfs compat code */
+ LASSERT(cathandle->lgh_file != NULL);
+ rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+ if (rc == 0)
+ rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
+ buf, th);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_add);
+
+/* For each cookie in the cookie array, we clear the log in-use bit and either:
+ * - the log is empty, so mark it free in the catalog header and delete it
+ * - the log is not empty, just write out the log header
+ *
+ * The cookies may be in different log files, so we need to get new logs
+ * each time.
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_cancel_records(const struct lu_env *env,
+ struct llog_handle *cathandle, int count,
+ struct llog_cookie *cookies)
+{
+ int i, index, rc = 0, failed = 0;
+
+ for (i = 0; i < count; i++, cookies++) {
+ struct llog_handle *loghandle;
+ struct llog_logid *lgl = &cookies->lgc_lgl;
+ int lrc;
+
+ rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+ if (rc) {
+ CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+ cathandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&lgl->lgl_oi), rc);
+ failed++;
+ continue;
+ }
+
+ lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
+ if (lrc == 1) { /* log has been destroyed */
+ index = loghandle->u.phd.phd_cookie.lgc_index;
+ rc = llog_cat_cleanup(env, cathandle, loghandle,
+ index);
+ } else if (lrc == -ENOENT) {
+ if (rc == 0) /* -ENOENT shouldn't overwrite an earlier error */
+ rc = lrc;
+ } else if (lrc < 0) {
+ failed++;
+ rc = lrc;
+ }
+ llog_handle_put(loghandle);
+ }
+ if (rc)
+ CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
+ cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
+ rc);
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_cancel_records);
+
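+/* Catalog processing callback: for each LLOG_LOGID_MAGIC record open the
+ * referenced plain llog and run the caller's callback over its records,
+ * honouring the lpd_startcat/lpd_startidx starting points.
+ */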
+static int llog_cat_process_cb(const struct lu_env *env,
+ struct llog_handle *cat_llh,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct llog_process_data *d = data;
+ struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+ struct llog_handle *llh;
+ int rc;
+
+ if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+ CERROR("invalid record in catalog\n");
+ return -EINVAL;
+ }
+ CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+ DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+ rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi));
+
+ rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+ if (rc) {
+ CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+ cat_llh->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&lir->lid_id.lgl_oi), rc);
+ return rc;
+ }
+
+ if (rec->lrh_index < d->lpd_startcat)
+ /* Skip processing of the logs until startcat */
+ rc = 0;
+ else if (d->lpd_startidx > 0) {
+ struct llog_process_cat_data cd;
+
+ cd.lpcd_first_idx = d->lpd_startidx;
+ cd.lpcd_last_idx = 0;
+ rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+ &cd, false);
+ /* Continue processing the next log from idx 0 */
+ d->lpd_startidx = 0;
+ } else {
+ rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+ NULL, false);
+ }
+
+ llog_handle_put(llh);
+
+ return rc;
+}
+
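+/* Process all plain llogs referenced by a catalog, starting at catalog
+ * index startcat and record index startidx. A catalog whose first used
+ * index has wrapped past zero is processed in two passes.
+ */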
+int llog_cat_process_or_fork(const struct lu_env *env,
+ struct llog_handle *cat_llh,
+ llog_cb_t cb, void *data, int startcat,
+ int startidx, bool fork)
+{
+ struct llog_process_data d;
+ struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+ int rc;
+
+ LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+ d.lpd_data = data;
+ d.lpd_cb = cb;
+ d.lpd_startcat = startcat;
+ d.lpd_startidx = startidx;
+
+ if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+ struct llog_process_cat_data cd;
+
+ CWARN("catlog "DOSTID" crosses index zero\n",
+ POSTID(&cat_llh->lgh_id.lgl_oi));
+
+ cd.lpcd_first_idx = llh->llh_cat_idx;
+ cd.lpcd_last_idx = 0;
+ rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+ &d, &cd, fork);
+ if (rc != 0)
+ return rc;
+
+ cd.lpcd_first_idx = 0;
+ cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+ rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+ &d, &cd, fork);
+ } else {
+ rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+ &d, NULL, fork);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+ llog_cb_t cb, void *data, int startcat, int startidx)
+{
+ return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
+ startidx, false);
+}
+EXPORT_SYMBOL(llog_cat_process);
+
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+ struct llog_handle *cat_llh,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct llog_process_data *d = data;
+ struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+ struct llog_handle *llh;
+ int rc;
+
+ if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+ CERROR("invalid record in catalog\n");
+ return -EINVAL;
+ }
+ CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+ DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+ le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));
+
+ rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+ if (rc) {
+ CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+ cat_llh->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&lir->lid_id.lgl_oi), rc);
+ return rc;
+ }
+
+ rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
+ llog_handle_put(llh);
+ return rc;
+}
+
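+/* Reverse-order counterpart of llog_cat_process(): walk the catalog
+ * backwards and process each referenced plain llog in reverse, splitting
+ * the walk in two when the catalog has wrapped past index zero.
+ */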
+int llog_cat_reverse_process(const struct lu_env *env,
+ struct llog_handle *cat_llh,
+ llog_cb_t cb, void *data)
+{
+ struct llog_process_data d;
+ struct llog_process_cat_data cd;
+ struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+ int rc;
+
+ LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+ d.lpd_data = data;
+ d.lpd_cb = cb;
+
+ if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+ CWARN("catalog "DOSTID" crosses index zero\n",
+ POSTID(&cat_llh->lgh_id.lgl_oi));
+
+ cd.lpcd_first_idx = 0;
+ cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+ rc = llog_reverse_process(env, cat_llh,
+ llog_cat_reverse_process_cb,
+ &d, &cd);
+ if (rc != 0)
+ return rc;
+
+ cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
+ cd.lpcd_last_idx = 0;
+ rc = llog_reverse_process(env, cat_llh,
+ llog_cat_reverse_process_cb,
+ &d, &cd);
+ } else {
+ rc = llog_reverse_process(env, cat_llh,
+ llog_cat_reverse_process_cb,
+ &d, NULL);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
+
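+/* Advance the catalog's first used index (llh_cat_idx) when the slot at
+ * @index is freed, skipping over any subsequent slots that are already
+ * clear in the bitmap until the next in-use slot or lgh_last_idx.
+ */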
+static int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
+{
+ struct llog_log_hdr *llh = cathandle->lgh_hdr;
+ int i, bitmap_size, idx;
+
+ bitmap_size = LLOG_BITMAP_SIZE(llh);
+ if (llh->llh_cat_idx == (index - 1)) {
+ idx = llh->llh_cat_idx + 1;
+ llh->llh_cat_idx = idx;
+ if (idx == cathandle->lgh_last_idx)
+ goto out;
+ for (i = (index + 1) % bitmap_size;
+ i != cathandle->lgh_last_idx;
+ i = (i + 1) % bitmap_size) {
+ if (!ext2_test_bit(i, llh->llh_bitmap)) {
+ idx = llh->llh_cat_idx + 1;
+ llh->llh_cat_idx = idx;
+ } else if (i == 0) {
+ llh->llh_cat_idx = 0;
+ } else {
+ break;
+ }
+ }
+out:
+ CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
+ POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
+ }
+
+ return 0;
+}
+
+/* Cleanup deleted plain llog traces from catalog */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle *loghandle, int index)
+{
+ int rc;
+
+ LASSERT(index);
+ if (loghandle != NULL) {
+ /* remove destroyed llog from catalog list and
+ * chd_current_log variable */
+ down_write(&cathandle->lgh_lock);
+ if (cathandle->u.chd.chd_current_log == loghandle)
+ cathandle->u.chd.chd_current_log = NULL;
+ list_del_init(&loghandle->u.phd.phd_entry);
+ up_write(&cathandle->lgh_lock);
+ LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
+ /* the llog was opened and kept on a list, close it now */
+ llog_close(env, loghandle);
+ }
+ /* remove plain llog entry from catalog by index */
+ llog_cat_set_first_idx(cathandle, index);
+ rc = llog_cancel_rec(env, cathandle, index);
+ if (rc == 0)
+ CDEBUG(D_HA, "cancel plain log at index %u of catalog " DOSTID "\n",
+ index, POSTID(&cathandle->lgh_id.lgl_oi));
+ return rc;
+}
+
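+/* Callback used by llog_cat_init_and_process(): open each plain llog
+ * referenced by the catalog, destroy it if it is flagged
+ * LLOG_F_ZAP_WHEN_EMPTY and holds only its header record, and drop
+ * catalog entries whose plain llog no longer exists (-ENOENT/-ESTALE).
+ */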
+static int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+ struct llog_handle *loghandle;
+ struct llog_log_hdr *llh;
+ int rc;
+
+ if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+ CERROR("invalid record in catalog\n");
+ return -EINVAL;
+ }
+
+ CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+ DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+ rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));
+
+ rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
+ if (rc) {
+ CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+ cathandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&lir->lid_id.lgl_oi), rc);
+ if (rc == -ENOENT || rc == -ESTALE) {
+ /* remove index from catalog */
+ llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
+ }
+ return rc;
+ }
+
+ llh = loghandle->lgh_hdr;
+ if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+ (llh->llh_count == 1)) {
+ rc = llog_destroy(env, loghandle);
+ if (rc)
+ CERROR("%s: fail to destroy empty log: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+
+ llog_cat_cleanup(env, cathandle, loghandle,
+ loghandle->u.phd.phd_cookie.lgc_index);
+ }
+ llog_handle_put(loghandle);
+
+ return rc;
+}
+
+/* helper to initialize a catalog llog and process it, cancelling empty plain llogs */
+int llog_cat_init_and_process(const struct lu_env *env,
+ struct llog_handle *llh)
+{
+ int rc;
+
+ rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
+ if (rc)
+ return rc;
+
+ rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
+ if (rc)
+ CERROR("%s: llog_process() with cat_cancel_cb failed: rc = %d\n",
+ llh->lgh_ctxt->loc_obd->obd_name, rc);
+ return 0;
+}
+EXPORT_SYMBOL(llog_cat_init_and_process);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
new file mode 100644
index 000000000..5332131a2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LLOG_INTERNAL_H__
+#define __LLOG_INTERNAL_H__
+
+#include "../include/lustre_log.h"
+
+struct llog_process_info {
+ struct llog_handle *lpi_loghandle;
+ llog_cb_t lpi_cb;
+ void *lpi_cbdata;
+ void *lpi_catdata;
+ int lpi_rc;
+ struct completion lpi_completion;
+ const struct lu_env *lpi_env;
+};
+
+struct llog_thread_info {
+ struct lu_attr lgi_attr;
+ struct lu_fid lgi_fid;
+ struct dt_object_format lgi_dof;
+ struct lu_buf lgi_buf;
+ loff_t lgi_off;
+ struct llog_rec_hdr lgi_lrh;
+ struct llog_rec_tail lgi_tail;
+};
+
+extern struct lu_context_key llog_thread_key;
+
+static inline struct llog_thread_info *llog_info(const struct lu_env *env)
+{
+ struct llog_thread_info *lgi;
+
+ lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key);
+ LASSERT(lgi);
+ return lgi;
+}
+
+static inline void
+lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
+{
+ ostid_set_seq_llog(&logid->lgl_oi);
+ ostid_set_id(&logid->lgl_oi, ino);
+ logid->lgl_ogen = gen;
+}
+
+int llog_info_init(void);
+void llog_info_fini(void);
+
+void llog_handle_get(struct llog_handle *loghandle);
+void llog_handle_put(struct llog_handle *loghandle);
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle **res, struct llog_logid *logid);
+int class_config_dump_handler(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data);
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
+int llog_process_or_fork(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle *loghandle, int index);
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
new file mode 100644
index 000000000..978d886a1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
@@ -0,0 +1,262 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include "../include/obd_class.h"
+#include "../include/lustre_log.h"
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd)
+{
+ struct llog_ctxt *ctxt;
+
+ OBD_ALLOC_PTR(ctxt);
+ if (!ctxt)
+ return NULL;
+
+ ctxt->loc_obd = obd;
+ atomic_set(&ctxt->loc_refcount, 1);
+
+ return ctxt;
+}
+
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+ if (ctxt->loc_exp) {
+ class_export_put(ctxt->loc_exp);
+ ctxt->loc_exp = NULL;
+ }
+ if (ctxt->loc_imp) {
+ class_import_put(ctxt->loc_imp);
+ ctxt->loc_imp = NULL;
+ }
+ OBD_FREE_PTR(ctxt);
+}
+
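+/* Drop a reference on a llog context. On the final put the context is
+ * detached from its obd_llog_group, the backend's cleanup method is run,
+ * the context is freed and any llog_cleanup() waiter is woken.
+ */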
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+ struct obd_llog_group *olg = ctxt->loc_olg;
+ struct obd_device *obd;
+ int rc = 0;
+
+ spin_lock(&olg->olg_lock);
+ if (!atomic_dec_and_test(&ctxt->loc_refcount)) {
+ spin_unlock(&olg->olg_lock);
+ return rc;
+ }
+ olg->olg_ctxts[ctxt->loc_idx] = NULL;
+ spin_unlock(&olg->olg_lock);
+
+ obd = ctxt->loc_obd;
+ spin_lock(&obd->obd_dev_lock);
+ /* sync with llog ctxt user thread */
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* obd->obd_starting is needed to handle cleanup in the error
+ * path while the obd is still starting up. */
+ LASSERTF(obd->obd_starting == 1 ||
+ obd->obd_stopping == 1 || obd->obd_set_up == 0,
+ "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+ !!obd->obd_stopping, !!obd->obd_set_up);
+
+ /* cleanup the llog ctxt here */
+ if (CTXTP(ctxt, cleanup))
+ rc = CTXTP(ctxt, cleanup)(env, ctxt);
+
+ llog_ctxt_destroy(ctxt);
+ wake_up(&olg->olg_waitq);
+ return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ struct obd_llog_group *olg;
+ int rc, idx;
+
+ LASSERT(ctxt != NULL);
+ LASSERT(ctxt != LP_POISON);
+
+ olg = ctxt->loc_olg;
+ LASSERT(olg != NULL);
+ LASSERT(olg != LP_POISON);
+
+ idx = ctxt->loc_idx;
+
+ /*
+ * Balance the ctxt reference taken by the caller of llog_cleanup()
+ */
+ LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+ LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+ llog_ctxt_put(ctxt);
+
+ /*
+ * Try to free the ctxt.
+ */
+ rc = __llog_ctxt_put(env, ctxt);
+ if (rc)
+ CERROR("Error %d while cleaning up ctxt %p\n",
+ rc, ctxt);
+
+ l_wait_event(olg->olg_waitq,
+ llog_group_ctxt_null(olg, idx), &lwi);
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_cleanup);
+
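+/* Allocate a llog context, register it at @index in the obd_llog_group
+ * and call the backend's lop_setup method. An -EEXIST from registration
+ * means the context is already set up and is treated as success.
+ */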
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+ struct obd_llog_group *olg, int index,
+ struct obd_device *disk_obd, struct llog_operations *op)
+{
+ struct llog_ctxt *ctxt;
+ int rc = 0;
+
+ if (index < 0 || index >= LLOG_MAX_CTXTS)
+ return -EINVAL;
+
+ LASSERT(olg != NULL);
+
+ ctxt = llog_new_ctxt(obd);
+ if (!ctxt)
+ return -ENOMEM;
+
+ ctxt->loc_obd = obd;
+ ctxt->loc_olg = olg;
+ ctxt->loc_idx = index;
+ ctxt->loc_logops = op;
+ mutex_init(&ctxt->loc_mutex);
+ ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
+ ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED;
+
+ rc = llog_group_set_ctxt(olg, ctxt, index);
+ if (rc) {
+ llog_ctxt_destroy(ctxt);
+ if (rc == -EEXIST) {
+ ctxt = llog_group_get_ctxt(olg, index);
+ if (ctxt) {
+ /*
+ * mds_lov_update_desc() might call here multiple
+ * times. So if the llog is already set up then
+ * don't do it again.
+ */
+ CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
+ obd->obd_name, index);
+ LASSERT(ctxt->loc_olg == olg);
+ LASSERT(ctxt->loc_obd == obd);
+ LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
+ LASSERT(ctxt->loc_logops == op);
+ llog_ctxt_put(ctxt);
+ }
+ rc = 0;
+ }
+ return rc;
+ }
+
+ if (op->lop_setup) {
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+ rc = -EOPNOTSUPP;
+ else
+ rc = op->lop_setup(env, obd, olg, index, disk_obd);
+ }
+
+ if (rc) {
+ CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n",
+ obd->obd_name, index, op->lop_setup, rc);
+ llog_group_clear_ctxt(olg, index);
+ llog_ctxt_destroy(ctxt);
+ } else {
+ CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n",
+ obd->obd_name, index);
+ ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_setup);
+
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags)
+{
+ int rc = 0;
+
+ if (!ctxt)
+ return 0;
+
+ if (CTXTP(ctxt, sync))
+ rc = CTXTP(ctxt, sync)(ctxt, exp, flags);
+
+ return rc;
+}
+EXPORT_SYMBOL(llog_sync);
+
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+ struct llog_cookie *cookies, int flags)
+{
+ int rc;
+
+ if (!ctxt) {
+ CERROR("No ctxt\n");
+ return -ENODEV;
+ }
+
+ CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
+ rc = CTXTP(ctxt, cancel)(env, ctxt, cookies, flags);
+ return rc;
+}
+EXPORT_SYMBOL(llog_cancel);
+
+/* context key constructor/destructor: llog_key_init, llog_key_fini */
+LU_KEY_INIT_FINI(llog, struct llog_thread_info);
+/* context key: llog_thread_key */
+LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL);
+LU_KEY_INIT_GENERIC(llog);
+EXPORT_SYMBOL(llog_thread_key);
+
+int llog_info_init(void)
+{
+ llog_key_init_generic(&llog_thread_key, NULL);
+ lu_context_key_register(&llog_thread_key);
+ return 0;
+}
+
+void llog_info_fini(void)
+{
+ lu_context_key_degister(&llog_thread_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
new file mode 100644
index 000000000..a2d5aa105
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
@@ -0,0 +1,415 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ * Author: jacob berkman <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include "../include/lustre_log.h"
+
+static void print_llogd_body(struct llogd_body *d)
+{
+ CDEBUG(D_OTHER, "llogd body: %p\n", d);
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n",
+ POSTID(&d->lgd_logid.lgl_oi));
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+ CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+ CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+ CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+ CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+ CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+ CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset);
+}
+
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+ __swab64s(&fid->f_seq);
+ __swab32s(&fid->f_oid);
+ __swab32s(&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+ if (fid_seq_is_mdt0(oid->oi.oi_seq)) {
+ __swab64s(&oid->oi.oi_id);
+ __swab64s(&oid->oi.oi_seq);
+ } else {
+ lustre_swab_lu_fid(&oid->oi_fid);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+void lustre_swab_llog_id(struct llog_logid *log_id)
+{
+ __swab64s(&log_id->lgl_oi.oi.oi_id);
+ __swab64s(&log_id->lgl_oi.oi.oi_seq);
+ __swab32s(&log_id->lgl_ogen);
+}
+EXPORT_SYMBOL(lustre_swab_llog_id);
+
+void lustre_swab_llogd_body(struct llogd_body *d)
+{
+ print_llogd_body(d);
+ lustre_swab_llog_id(&d->lgd_logid);
+ __swab32s(&d->lgd_ctxt_idx);
+ __swab32s(&d->lgd_llh_flags);
+ __swab32s(&d->lgd_index);
+ __swab32s(&d->lgd_saved_index);
+ __swab32s(&d->lgd_len);
+ __swab64s(&d->lgd_cur_offset);
+ print_llogd_body(d);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+void lustre_swab_llogd_conn_body(struct llogd_conn_body *d)
+{
+ __swab64s(&d->lgdc_gen.mnt_cnt);
+ __swab64s(&d->lgdc_gen.conn_cnt);
+ lustre_swab_llog_id(&d->lgdc_logid);
+ __swab32s(&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+ __swab64s(&fid->id);
+ __swab32s(&fid->generation);
+ __swab32s(&fid->f_type);
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+ __swab64s(&range->lsr_start);
+ __swab64s(&range->lsr_end);
+ __swab32s(&range->lsr_index);
+ __swab32s(&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+
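+/* Byte-swap a llog record in place, dispatching on lrh_type and finishing
+ * with the record tail where one is present.
+ */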
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+ struct llog_rec_tail *tail = NULL;
+
+ __swab32s(&rec->lrh_len);
+ __swab32s(&rec->lrh_index);
+ __swab32s(&rec->lrh_type);
+ __swab32s(&rec->lrh_id);
+
+ switch (rec->lrh_type) {
+ case OST_SZ_REC:
+ {
+ struct llog_size_change_rec *lsc =
+ (struct llog_size_change_rec *)rec;
+
+ lustre_swab_ll_fid(&lsc->lsc_fid);
+ __swab32s(&lsc->lsc_ioepoch);
+ tail = &lsc->lsc_tail;
+ break;
+ }
+ case MDS_UNLINK_REC:
+ {
+ struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+ __swab64s(&lur->lur_oid);
+ __swab32s(&lur->lur_oseq);
+ __swab32s(&lur->lur_count);
+ tail = &lur->lur_tail;
+ break;
+ }
+ case MDS_UNLINK64_REC:
+ {
+ struct llog_unlink64_rec *lur =
+ (struct llog_unlink64_rec *)rec;
+
+ lustre_swab_lu_fid(&lur->lur_fid);
+ __swab32s(&lur->lur_count);
+ tail = &lur->lur_tail;
+ break;
+ }
+ case CHANGELOG_REC:
+ {
+ struct llog_changelog_rec *cr =
+ (struct llog_changelog_rec *)rec;
+
+ __swab16s(&cr->cr.cr_namelen);
+ __swab16s(&cr->cr.cr_flags);
+ __swab32s(&cr->cr.cr_type);
+ __swab64s(&cr->cr.cr_index);
+ __swab64s(&cr->cr.cr_prev);
+ __swab64s(&cr->cr.cr_time);
+ lustre_swab_lu_fid(&cr->cr.cr_tfid);
+ lustre_swab_lu_fid(&cr->cr.cr_pfid);
+ if (CHANGELOG_REC_EXTENDED(&cr->cr)) {
+ struct llog_changelog_ext_rec *ext =
+ (struct llog_changelog_ext_rec *)rec;
+
+ lustre_swab_lu_fid(&ext->cr.cr_sfid);
+ lustre_swab_lu_fid(&ext->cr.cr_spfid);
+ tail = &ext->cr_tail;
+ } else {
+ tail = &cr->cr_tail;
+ }
+ tail = (struct llog_rec_tail *)((char *)tail +
+ cr->cr.cr_namelen);
+ break;
+ }
+ case CHANGELOG_USER_REC:
+ {
+ struct llog_changelog_user_rec *cur =
+ (struct llog_changelog_user_rec *)rec;
+
+ __swab32s(&cur->cur_id);
+ __swab64s(&cur->cur_endrec);
+ tail = &cur->cur_tail;
+ break;
+ }
+
+ case HSM_AGENT_REC: {
+ struct llog_agent_req_rec *arr =
+ (struct llog_agent_req_rec *)rec;
+
+ __swab32s(&arr->arr_hai.hai_len);
+ __swab32s(&arr->arr_hai.hai_action);
+ lustre_swab_lu_fid(&arr->arr_hai.hai_fid);
+ lustre_swab_lu_fid(&arr->arr_hai.hai_dfid);
+ __swab64s(&arr->arr_hai.hai_cookie);
+ __swab64s(&arr->arr_hai.hai_extent.offset);
+ __swab64s(&arr->arr_hai.hai_extent.length);
+ __swab64s(&arr->arr_hai.hai_gid);
+ /* no swabbing for opaque data */
+ /* hai_data[0]; */
+ break;
+ }
+
+ case MDS_SETATTR64_REC:
+ {
+ struct llog_setattr64_rec *lsr =
+ (struct llog_setattr64_rec *)rec;
+
+ lustre_swab_ost_id(&lsr->lsr_oi);
+ __swab32s(&lsr->lsr_uid);
+ __swab32s(&lsr->lsr_uid_h);
+ __swab32s(&lsr->lsr_gid);
+ __swab32s(&lsr->lsr_gid_h);
+ tail = &lsr->lsr_tail;
+ break;
+ }
+ case OBD_CFG_REC:
+ /* these are swabbed as they are consumed */
+ break;
+ case LLOG_HDR_MAGIC:
+ {
+ struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+ __swab64s(&llh->llh_timestamp);
+ __swab32s(&llh->llh_count);
+ __swab32s(&llh->llh_bitmap_offset);
+ __swab32s(&llh->llh_flags);
+ __swab32s(&llh->llh_size);
+ __swab32s(&llh->llh_cat_idx);
+ tail = &llh->llh_tail;
+ break;
+ }
+ case LLOG_LOGID_MAGIC:
+ {
+ struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+ lustre_swab_llog_id(&lid->lid_id);
+ tail = &lid->lid_tail;
+ break;
+ }
+ case LLOG_GEN_REC:
+ {
+ struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
+
+ __swab64s(&lgr->lgr_gen.mnt_cnt);
+ __swab64s(&lgr->lgr_gen.conn_cnt);
+ tail = &lgr->lgr_tail;
+ break;
+ }
+ case LLOG_PAD_MAGIC:
+ break;
+ default:
+ CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+ rec->lrh_type, rec);
+ }
+
+ if (tail) {
+ __swab32s(&tail->lrt_len);
+ __swab32s(&tail->lrt_index);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+
+static void print_llog_hdr(struct llog_log_hdr *h)
+{
+ CDEBUG(D_OTHER, "llog header: %p\n", h);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+ CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp);
+ CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+ CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+ CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+ CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+ CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+ CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+ CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+void lustre_swab_llog_hdr(struct llog_log_hdr *h)
+{
+ print_llog_hdr(h);
+
+ lustre_swab_llog_rec(&h->llh_hdr);
+
+ print_llog_hdr(h);
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+ int i;
+
+ if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */
+ return;
+ CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+ if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+ for (i = 0; i < lcfg->lcfg_bufcount; i++)
+ CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+ i, lcfg->lcfg_buflens[i]);
+}
+
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+ int i;
+
+ __swab32s(&lcfg->lcfg_version);
+
+ if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
+ CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+ lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+ return;
+ }
+
+ __swab32s(&lcfg->lcfg_command);
+ __swab32s(&lcfg->lcfg_num);
+ __swab32s(&lcfg->lcfg_flags);
+ __swab64s(&lcfg->lcfg_nid);
+ __swab32s(&lcfg->lcfg_bufcount);
+ for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++)
+ __swab32s(&lcfg->lcfg_buflens[i]);
+
+ print_lustre_cfg(lcfg);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data */
+struct cfg_marker32 {
+ __u32 cm_step;
+ __u32 cm_flags;
+ __u32 cm_vers;
+ __u32 padding;
+ __u32 cm_createtime;
+ __u32 cm_canceltime;
+ char cm_tgtname[MTI_NAME_MAXLEN];
+ char cm_comment[MTI_NAME_MAXLEN];
+};
+
+#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \
+ (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+ struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker;
+
+ if (swab) {
+ __swab32s(&marker->cm_step);
+ __swab32s(&marker->cm_flags);
+ __swab32s(&marker->cm_vers);
+ }
+ if (size == sizeof(*cm32)) {
+ __u32 createtime, canceltime;
+ /* There was a problem with the original declaration of
+ * cfg_marker on 32-bit systems because it used time_t as
+ * a wire protocol structure, and didn't verify this in
+ * wirecheck. We now have to convert the offsets of the
+ * later fields in order to work on 32- and 64-bit systems.
+ *
+ * Fortunately, the cm_comment field has no functional use
+ * so can be sacrificed when converting the timestamp size.
+ *
+ * Overwrite fields from the end first, so they are not
+ * clobbered, and use memmove() instead of memcpy() because
+ * the source and target buffers overlap. bug 16771 */
+ createtime = cm32->cm_createtime;
+ canceltime = cm32->cm_canceltime;
+ memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+ marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+ memmove(marker->cm_tgtname, cm32->cm_tgtname,
+ sizeof(marker->cm_tgtname));
+ if (swab) {
+ __swab32s(&createtime);
+ __swab32s(&canceltime);
+ }
+ marker->cm_createtime = createtime;
+ marker->cm_canceltime = canceltime;
+ CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n",
+ marker->cm_tgtname);
+ } else if (swab) {
+ __swab64s(&marker->cm_createtime);
+ __swab64s(&marker->cm_canceltime);
+ }
+
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c
new file mode 100644
index 000000000..c49dfe541
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ *
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_counters.c
+ *
+ * Lustre lprocfs counter routines
+ *
+ * Author: Andreas Dilger <andreas.dilger@intel.com>
+ */
+
+#include <linux/module.h>
+#include "../include/lprocfs_status.h"
+#include "../include/obd_support.h"
+
+struct lprocfs_stats *obd_memory = NULL;
+EXPORT_SYMBOL(obd_memory);
+
+void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount)
+{
+ struct lprocfs_counter *percpu_cntr;
+ struct lprocfs_counter_header *header;
+ int smp_id;
+ unsigned long flags = 0;
+
+ if (stats == NULL)
+ return;
+
+ LASSERTF(0 <= idx && idx < stats->ls_num,
+ "idx %d, ls_num %hu\n", idx, stats->ls_num);
+
+ /* With per-client stats, statistics are allocated only for a
+ * single CPU area, so smp_id should always be 0. */
+ smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+ if (smp_id < 0)
+ return;
+
+ header = &stats->ls_cnt_header[idx];
+ percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+ percpu_cntr->lc_count++;
+
+ if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+ /*
+ * lprocfs_counter_add() can be called in interrupt context,
+ * as memory allocation could trigger memory shrinker call
+ * ldlm_pool_shrink(), which calls lprocfs_counter_add().
+ * LU-1727.
+ *
+ * Only obd_memory uses the LPROCFS_STATS_FLAG_IRQ_SAFE
+ * flag, because it needs accurate counting, otherwise the
+ * memory leak check would report errors.
+ */
+ if (in_interrupt() &&
+ (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ percpu_cntr->lc_sum_irq += amount;
+ else
+ percpu_cntr->lc_sum += amount;
+
+ if (header->lc_config & LPROCFS_CNTR_STDDEV)
+ percpu_cntr->lc_sumsquare += (__s64)amount * amount;
+ if (amount < percpu_cntr->lc_min)
+ percpu_cntr->lc_min = amount;
+ if (amount > percpu_cntr->lc_max)
+ percpu_cntr->lc_max = amount;
+ }
+ lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_add);
+
+void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount)
+{
+ struct lprocfs_counter *percpu_cntr;
+ struct lprocfs_counter_header *header;
+ int smp_id;
+ unsigned long flags = 0;
+
+ if (stats == NULL)
+ return;
+
+ LASSERTF(0 <= idx && idx < stats->ls_num,
+ "idx %d, ls_num %hu\n", idx, stats->ls_num);
+
+ /* With per-client stats, statistics are allocated only for a
+ * single CPU area, so smp_id should always be 0. */
+ smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+ if (smp_id < 0)
+ return;
+
+ header = &stats->ls_cnt_header[idx];
+ percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+ if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+ /*
+ * Sometimes we use RCU callbacks to free memory which calls
+ * lprocfs_counter_sub(), and RCU callbacks may execute in
+ * softirq context - right now that's the only case we're in
+ * softirq context here, use separate counter for that.
+ * bz20650.
+ *
+ * Only obd_memory uses the LPROCFS_STATS_FLAG_IRQ_SAFE
+ * flag, because it needs accurate counting, otherwise the
+ * memory leak check would report errors.
+ */
+ if (in_interrupt() &&
+ (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ percpu_cntr->lc_sum_irq -= amount;
+ else
+ percpu_cntr->lc_sum -= amount;
+ }
+ lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_sub);
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
new file mode 100644
index 000000000..c171c6c6c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
@@ -0,0 +1,2059 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre/lustre_idl.h"
+#include <linux/seq_file.h>
+#include <linux/ctype.h>
+
+static const char * const obd_connect_names[] = {
+ "read_only",
+ "lov_index",
+ "unused",
+ "write_grant",
+ "server_lock",
+ "version",
+ "request_portal",
+ "acl",
+ "xattr",
+ "create_on_write",
+ "truncate_lock",
+ "initial_transno",
+ "inode_bit_locks",
+ "join_file(obsolete)",
+ "getattr_by_fid",
+ "no_oh_for_devices",
+ "remote_client",
+ "remote_client_by_force",
+ "max_byte_per_rpc",
+ "64bit_qdata",
+ "mds_capability",
+ "oss_capability",
+ "early_lock_cancel",
+ "som",
+ "adaptive_timeouts",
+ "lru_resize",
+ "mds_mds_connection",
+ "real_conn",
+ "change_qunit_size",
+ "alt_checksum_algorithm",
+ "fid_is_enabled",
+ "version_recovery",
+ "pools",
+ "grant_shrink",
+ "skip_orphan",
+ "large_ea",
+ "full20",
+ "layout_lock",
+ "64bithash",
+ "object_max_bytes",
+ "imp_recov",
+ "jobstats",
+ "umask",
+ "einprogress",
+ "grant_param",
+ "flock_owner",
+ "lvb_type",
+ "nanoseconds_times",
+ "lightweight_conn",
+ "short_io",
+ "pingless",
+ "flock_deadlock",
+ "disp_stripe",
+ "unknown",
+ NULL
+};
+
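+/* Render the set bits of @flags as their names from obd_connect_names,
+ * separated by @sep; any remaining unknown high bits are appended as a
+ * hexadecimal mask.
+ */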
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep)
+{
+ __u64 mask = 1;
+ int i, ret = 0;
+
+ for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+ if (flags & mask)
+ ret += snprintf(page + ret, count - ret, "%s%s",
+ ret ? sep : "", obd_connect_names[i]);
+ }
+ if (flags & ~(mask - 1))
+ ret += snprintf(page + ret, count - ret,
+ "%sunknown flags %#llx",
+ ret ? sep : "", flags & ~(mask - 1));
+ return ret;
+}
+EXPORT_SYMBOL(obd_connect_flags2str);
+
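+/* Format @val scaled down by @mult as a decimal string with a short
+ * fractional part (trailing zeros stripped) followed by a newline.
+ */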
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+ int mult)
+{
+ long decimal_val, frac_val;
+ int prtn;
+
+ if (count < 10)
+ return -EINVAL;
+
+ decimal_val = val / mult;
+ prtn = snprintf(buffer, count, "%ld", decimal_val);
+ frac_val = val % mult;
+
+ if (prtn < (count - 4) && frac_val > 0) {
+ long temp_frac;
+ int i, temp_mult = 1, frac_bits = 0;
+
+ temp_frac = frac_val * 10;
+ buffer[prtn++] = '.';
+ while (frac_bits < 2 && (temp_frac / mult) < 1) {
+ /* only keep 2 fractional digits */
+ buffer[prtn++] = '0';
+ temp_frac *= 10;
+ frac_bits++;
+ }
+ /*
+ * Cases to consider:
+ * 1. #echo x.00 > /proc/xxx output result : x
+ * 2. #echo x.0x > /proc/xxx output result : x.0x
+ * 3. #echo x.x0 > /proc/xxx output result : x.x
+ * 4. #echo x.xx > /proc/xxx output result : x.xx
+ * Only 2 fractional digits are kept.
+ */
+ for (i = 0; i < (5 - prtn); i++)
+ temp_mult *= 10;
+
+ frac_bits = min((int)count - prtn, 3 - frac_bits);
+ prtn += snprintf(buffer + prtn, frac_bits, "%ld",
+ frac_val * temp_mult / mult);
+
+ prtn--;
+ while (buffer[prtn] < '1' || buffer[prtn] > '9') {
+ prtn--;
+ if (buffer[prtn] == '.') {
+ prtn--;
+ break;
+ }
+ }
+ prtn++;
+ }
+ buffer[prtn++] = '\n';
+ return prtn;
+}
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
+
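+/* Parse a user-supplied decimal string such as "3.25" into an integer
+ * scaled by @mult, accepting an optional leading '-' and at most five
+ * fractional digits.
+ */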
+int lprocfs_write_frac_helper(const char __user *buffer, unsigned long count,
+ int *val, int mult)
+{
+ char kernbuf[20], *end, *pbuf;
+
+ if (count > (sizeof(kernbuf) - 1))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = '\0';
+ pbuf = kernbuf;
+ if (*pbuf == '-') {
+ mult = -mult;
+ pbuf++;
+ }
+
+ *val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+ if (pbuf == end)
+ return -EINVAL;
+
+ if (end != NULL && *end == '.') {
+ int temp_val, pow = 1;
+ int i;
+
+ pbuf = end + 1;
+ if (strlen(pbuf) > 5)
+ pbuf[5] = '\0'; /* only allow 5 fractional digits */
+
+ temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+
+ if (pbuf < end) {
+ for (i = 0; i < (end - pbuf); i++)
+ pow *= 10;
+
+ *val += temp_val / pow;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
+
+#if defined (CONFIG_PROC_FS)
+
+static int lprocfs_no_percpu_stats;
+module_param(lprocfs_no_percpu_stats, int, 0644);
+MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats");
+
+#define MAX_STRING_SIZE 128
+
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+ return seq_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+/* lprocfs API calls */
+
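+/* Create a single /proc entry under @root, deriving its mode from the
+ * supplied file operations (0444 when readable, plus 0200 when writable).
+ */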
+struct proc_dir_entry *lprocfs_add_simple(struct proc_dir_entry *root,
+ char *name, void *data,
+ struct file_operations *fops)
+{
+ struct proc_dir_entry *proc;
+ umode_t mode = 0;
+
+ if (root == NULL || name == NULL || fops == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (fops->read)
+ mode = 0444;
+ if (fops->write)
+ mode |= 0200;
+ proc = proc_create_data(name, mode, root, fops, data);
+ if (!proc) {
+ CERROR("LprocFS: No memory to create /proc entry %s", name);
+ return ERR_PTR(-ENOMEM);
+ }
+ return proc;
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+ struct proc_dir_entry *parent, const char *format, ...)
+{
+ struct proc_dir_entry *entry;
+ char *dest;
+ va_list ap;
+
+ if (parent == NULL || format == NULL)
+ return NULL;
+
+ OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+ if (dest == NULL)
+ return NULL;
+
+ va_start(ap, format);
+ vsnprintf(dest, MAX_STRING_SIZE, format, ap);
+ va_end(ap);
+
+ entry = proc_symlink(name, parent, dest);
+ if (entry == NULL)
+ CERROR("LprocFS: Could not create symbolic link from %s to %s",
+ name, dest);
+
+ OBD_FREE(dest, MAX_STRING_SIZE + 1);
+ return entry;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+static struct file_operations lprocfs_generic_fops = { };
+
+/**
+ * Add /proc entries.
+ *
+ * \param root [in] The parent proc entry under which new entries will be added.
+ * \param list [in] Array of proc entries to be added.
+ * \param data [in] The argument passed when the entries' read/write routines
+ * are called through the /proc file.
+ *
+ * \retval 0 on success
+ * < 0 on error
+ */
+int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
+ void *data)
+{
+ if (root == NULL || list == NULL)
+ return -EINVAL;
+
+ while (list->name != NULL) {
+ struct proc_dir_entry *proc;
+ umode_t mode = 0;
+
+ if (list->proc_mode != 0000) {
+ mode = list->proc_mode;
+ } else if (list->fops) {
+ if (list->fops->read)
+ mode = 0444;
+ if (list->fops->write)
+ mode |= 0200;
+ }
+ proc = proc_create_data(list->name, mode, root,
+ list->fops ?: &lprocfs_generic_fops,
+ list->data ?: data);
+ if (proc == NULL)
+ return -ENOMEM;
+ list++;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_add_vars);
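+
+/*
+ * Illustrative sketch (not part of this change): callers typically describe
+ * their /proc files with a NULL-terminated lprocfs_vars table and register
+ * it in one call.  "example_uuid_fops" and "example_obd" are hypothetical.
+ *
+ *	static struct lprocfs_vars example_vars[] = {
+ *		{ .name = "uuid", .fops = &example_uuid_fops },
+ *		{ NULL }
+ *	};
+ *
+ *	rc = lprocfs_add_vars(example_obd->obd_proc_entry, example_vars,
+ *			      example_obd);
+ *
+ * Entries with a zero proc_mode get their mode derived from the presence of
+ * fops read/write methods, as implemented above.
+ */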
+
+void lprocfs_remove(struct proc_dir_entry **rooth)
+{
+ proc_remove(*rooth);
+ *rooth = NULL;
+}
+EXPORT_SYMBOL(lprocfs_remove);
+
+void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+ LASSERT(parent != NULL);
+ remove_proc_entry(name, parent);
+}
+EXPORT_SYMBOL(lprocfs_remove_proc_entry);
+
+struct proc_dir_entry *lprocfs_register(const char *name,
+ struct proc_dir_entry *parent,
+ struct lprocfs_vars *list, void *data)
+{
+ struct proc_dir_entry *entry;
+
+ entry = proc_mkdir(name, parent);
+ if (entry == NULL) {
+ entry = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ if (list != NULL) {
+ int rc = lprocfs_add_vars(entry, list, data);
+ if (rc != 0) {
+ lprocfs_remove(&entry);
+ entry = ERR_PTR(rc);
+ }
+ }
+out:
+ return entry;
+}
+EXPORT_SYMBOL(lprocfs_register);
+
+/* Generic callbacks */
+int lprocfs_rd_uint(struct seq_file *m, void *data)
+{
+ seq_printf(m, "%u\n", *(unsigned int *)data);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_uint);
+
+int lprocfs_wr_uint(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ unsigned *p = data;
+ char dummy[MAX_STRING_SIZE + 1], *end;
+ unsigned long tmp;
+
+ dummy[MAX_STRING_SIZE] = '\0';
+ if (copy_from_user(dummy, buffer, MAX_STRING_SIZE))
+ return -EFAULT;
+
+ tmp = simple_strtoul(dummy, &end, 0);
+ if (dummy == end)
+ return -EINVAL;
+
+ *p = (unsigned int)tmp;
+ return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_uint);
+
+int lprocfs_rd_u64(struct seq_file *m, void *data)
+{
+ seq_printf(m, "%llu\n", *(__u64 *)data);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_u64);
+
+int lprocfs_rd_atomic(struct seq_file *m, void *data)
+{
+ atomic_t *atom = data;
+ LASSERT(atom != NULL);
+ seq_printf(m, "%d\n", atomic_read(atom));
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_atomic);
+
+int lprocfs_wr_atomic(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ atomic_t *atm = data;
+ int val = 0;
+ int rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc < 0)
+ return rc;
+
+ if (val <= 0)
+ return -ERANGE;
+
+ atomic_set(atm, val);
+ return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_atomic);
+
+int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+
+ LASSERT(obd != NULL);
+ seq_printf(m, "%s\n", obd->obd_uuid.uuid);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_uuid);
+
+int lprocfs_rd_name(struct seq_file *m, void *data)
+{
+ struct obd_device *dev = data;
+
+ LASSERT(dev != NULL);
+ seq_printf(m, "%s\n", dev->obd_name);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_name);
+
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_statfs osfs;
+ int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc)
+ seq_printf(m, "%u\n", osfs.os_bsize);
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_blksize);
+
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_statfs osfs;
+ int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_blocks;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ seq_printf(m, "%llu\n", result);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
+
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_statfs osfs;
+ int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_bfree;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ seq_printf(m, "%llu\n", result);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
+
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_statfs osfs;
+ int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc) {
+ __u32 blk_size = osfs.os_bsize >> 10;
+ __u64 result = osfs.os_bavail;
+
+ while (blk_size >>= 1)
+ result <<= 1;
+
+ seq_printf(m, "%llu\n", result);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
+
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_statfs osfs;
+ int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc)
+ seq_printf(m, "%llu\n", osfs.os_files);
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_filestotal);
+
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_statfs osfs;
+ int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ OBD_STATFS_NODELAY);
+ if (!rc)
+ seq_printf(m, "%llu\n", osfs.os_ffree);
+
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_filesfree);
+
+int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct obd_import *imp;
+ char *imp_state_name = NULL;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+ imp_state_name = ptlrpc_import_state_name(imp->imp_state);
+ seq_printf(m, "%s\t%s%s\n",
+ obd2cli_tgt(obd), imp_state_name,
+ imp->imp_deactive ? "\tDEACTIVATED" : "");
+
+ LPROCFS_CLIMP_EXIT(obd);
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_server_uuid);
+
+int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ struct ptlrpc_connection *conn;
+
+ LASSERT(obd != NULL);
+
+ LPROCFS_CLIMP_CHECK(obd);
+ conn = obd->u.cli.cl_import->imp_connection;
+ if (conn)
+ seq_printf(m, "%s\n", conn->c_remote_uuid.uuid);
+ else
+ seq_puts(m, "<none>\n");
+
+ LPROCFS_CLIMP_EXIT(obd);
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
+
+/** add up per-cpu counters */
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+ struct lprocfs_counter *cnt)
+{
+ unsigned int num_entry;
+ struct lprocfs_counter *percpu_cntr;
+ int i;
+ unsigned long flags = 0;
+
+ memset(cnt, 0, sizeof(*cnt));
+
+ if (stats == NULL) {
+ /* set count to 1 to avoid divide-by-zero errs in callers */
+ cnt->lc_count = 1;
+ return;
+ }
+
+ cnt->lc_min = LC_MIN_INIT;
+
+ num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+ for (i = 0; i < num_entry; i++) {
+ if (stats->ls_percpu[i] == NULL)
+ continue;
+ percpu_cntr = lprocfs_stats_counter_get(stats, i, idx);
+
+ cnt->lc_count += percpu_cntr->lc_count;
+ cnt->lc_sum += percpu_cntr->lc_sum;
+ if (percpu_cntr->lc_min < cnt->lc_min)
+ cnt->lc_min = percpu_cntr->lc_min;
+ if (percpu_cntr->lc_max > cnt->lc_max)
+ cnt->lc_max = percpu_cntr->lc_max;
+ cnt->lc_sumsquare += percpu_cntr->lc_sumsquare;
+ }
+
+ lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_stats_collect);
+
+/**
+ * Append a space separated list of current set flags to str.
+ */
+#define flag2str(flag, first) \
+ do { \
+ if (imp->imp_##flag) \
+ seq_printf(m, "%s" #flag, first ? "" : ", "); \
+ } while (0)
+static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m)
+{
+ bool first = true;
+
+ if (imp->imp_obd->obd_no_recov) {
+ seq_printf(m, "no_recov");
+ first = false;
+ }
+
+ flag2str(invalid, first);
+ first = false;
+ flag2str(deactive, first);
+ flag2str(replayable, first);
+ flag2str(pingable, first);
+ return 0;
+}
+#undef flag2str
+
+static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep)
+{
+ __u64 mask = 1;
+ int i;
+ bool first = true;
+
+ for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+ if (flags & mask) {
+ seq_printf(m, "%s%s",
+ first ? sep : "", obd_connect_names[i]);
+ first = false;
+ }
+ }
+ if (flags & ~(mask - 1))
+ seq_printf(m, "%sunknown flags %#llx",
+ first ? sep : "", flags & ~(mask - 1));
+}
+
+int lprocfs_rd_import(struct seq_file *m, void *data)
+{
+ struct lprocfs_counter ret;
+ struct lprocfs_counter_header *header;
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ struct obd_import_conn *conn;
+ int j;
+ int k;
+ int rw = 0;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+
+ seq_printf(m,
+ "import:\n"
+ " name: %s\n"
+ " target: %s\n"
+ " state: %s\n"
+ " instance: %u\n"
+ " connect_flags: [",
+ obd->obd_name,
+ obd2cli_tgt(obd),
+ ptlrpc_import_state_name(imp->imp_state),
+ imp->imp_connect_data.ocd_instance);
+ obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", ");
+ seq_printf(m,
+ "]\n"
+ " import_flags: [");
+ obd_import_flags2str(imp, m);
+
+ seq_printf(m,
+ "]\n"
+ " connection:\n"
+ " failover_nids: [");
+ spin_lock(&imp->imp_lock);
+ j = 0;
+ list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+ seq_printf(m, "%s%s", j ? ", " : "",
+ libcfs_nid2str(conn->oic_conn->c_peer.nid));
+ j++;
+ }
+ seq_printf(m,
+ "]\n"
+ " current_connection: %s\n"
+ " connection_attempts: %u\n"
+ " generation: %u\n"
+ " in-progress_invalidations: %u\n",
+ imp->imp_connection == NULL ? "<none>" :
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ imp->imp_conn_cnt,
+ imp->imp_generation,
+ atomic_read(&imp->imp_inval_count));
+ spin_unlock(&imp->imp_lock);
+
+ if (obd->obd_svc_stats == NULL)
+ goto out_climp;
+
+ header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR];
+ lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret);
+ if (ret.lc_count != 0) {
+ /* first argument to do_div MUST be __u64 */
+ __u64 sum = ret.lc_sum;
+ do_div(sum, ret.lc_count);
+ ret.lc_sum = sum;
+ } else
+ ret.lc_sum = 0;
+ seq_printf(m,
+ " rpcs:\n"
+ " inflight: %u\n"
+ " unregistering: %u\n"
+ " timeouts: %u\n"
+ " avg_waittime: %llu %s\n",
+ atomic_read(&imp->imp_inflight),
+ atomic_read(&imp->imp_unregistering),
+ atomic_read(&imp->imp_timeouts),
+ ret.lc_sum, header->lc_units);
+
+ k = 0;
+ for (j = 0; j < IMP_AT_MAX_PORTALS; j++) {
+ if (imp->imp_at.iat_portal[j] == 0)
+ break;
+ k = max_t(unsigned int, k,
+ at_get(&imp->imp_at.iat_service_estimate[j]));
+ }
+ seq_printf(m,
+ " service_estimates:\n"
+ " services: %u sec\n"
+ " network: %u sec\n",
+ k,
+ at_get(&imp->imp_at.iat_net_latency));
+
+ seq_printf(m,
+ " transactions:\n"
+ " last_replay: %llu\n"
+ " peer_committed: %llu\n"
+ " last_checked: %llu\n",
+ imp->imp_last_replay_transno,
+ imp->imp_peer_committed_transno,
+ imp->imp_last_transno_checked);
+
+ /* avg data rates */
+ for (rw = 0; rw <= 1; rw++) {
+ lprocfs_stats_collect(obd->obd_svc_stats,
+ PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw,
+ &ret);
+ if (ret.lc_sum > 0 && ret.lc_count > 0) {
+ /* first argument to do_div MUST be __u64 */
+ __u64 sum = ret.lc_sum;
+ do_div(sum, ret.lc_count);
+ ret.lc_sum = sum;
+ seq_printf(m,
+ " %s_data_averages:\n"
+ " bytes_per_rpc: %llu\n",
+ rw ? "write" : "read",
+ ret.lc_sum);
+ }
+ k = (int)ret.lc_sum;
+ j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES;
+ header = &obd->obd_svc_stats->ls_cnt_header[j];
+ lprocfs_stats_collect(obd->obd_svc_stats, j, &ret);
+ if (ret.lc_sum > 0 && ret.lc_count != 0) {
+ /* first argument to do_div MUST be __u64 */
+ __u64 sum = ret.lc_sum;
+ do_div(sum, ret.lc_count);
+ ret.lc_sum = sum;
+ seq_printf(m,
+ " %s_per_rpc: %llu\n",
+ header->lc_units, ret.lc_sum);
+ j = (int)ret.lc_sum;
+ if (j > 0)
+ seq_printf(m,
+ " MB_per_sec: %u.%.02u\n",
+ k / j, (100 * k / j) % 100);
+ }
+ }
+
+out_climp:
+ LPROCFS_CLIMP_EXIT(obd);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_import);
+
+int lprocfs_rd_state(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ int j, k;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+
+ seq_printf(m, "current_state: %s\n",
+ ptlrpc_import_state_name(imp->imp_state));
+ seq_printf(m, "state_history:\n");
+ k = imp->imp_state_hist_idx;
+ for (j = 0; j < IMP_STATE_HIST_LEN; j++) {
+ struct import_state_hist *ish =
+ &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN];
+ if (ish->ish_state == 0)
+ continue;
+ seq_printf(m, " - ["CFS_TIME_T", %s]\n",
+ ish->ish_time,
+ ptlrpc_import_state_name(ish->ish_state));
+ }
+
+ LPROCFS_CLIMP_EXIT(obd);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_state);
+
+int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at)
+{
+ int i;
+ for (i = 0; i < AT_BINS; i++)
+ seq_printf(m, "%3u ", at->at_hist[i]);
+ seq_printf(m, "\n");
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ unsigned int cur, worst;
+ time_t now, worstt;
+ struct dhms ts;
+ int i;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+
+ now = get_seconds();
+
+ /* Some network health info for kicks */
+ s2dhms(&ts, now - imp->imp_last_reply_time);
+ seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n",
+ "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+ cur = at_get(&imp->imp_at.iat_net_latency);
+ worst = imp->imp_at.iat_net_latency.at_worst_ever;
+ worstt = imp->imp_at.iat_net_latency.at_worst_time;
+ s2dhms(&ts, now - worstt);
+ seq_printf(m, "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
+ "network", cur, worst, worstt, DHMS_VARS(&ts));
+ lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency);
+
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ if (imp->imp_at.iat_portal[i] == 0)
+ break;
+ cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+ worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+ worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+ s2dhms(&ts, now - worstt);
+ seq_printf(m, "portal %-2d : cur %3u worst %3u (at %ld, "
+ DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+ cur, worst, worstt, DHMS_VARS(&ts));
+ lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]);
+ }
+
+ LPROCFS_CLIMP_EXIT(obd);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+
+int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+ __u64 flags;
+
+ LPROCFS_CLIMP_CHECK(obd);
+ flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
+ seq_printf(m, "flags=%#llx\n", flags);
+ obd_connect_seq_flags2str(m, flags, "\n");
+ seq_printf(m, "\n");
+ LPROCFS_CLIMP_EXIT(obd);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
+
+int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = data;
+
+ LASSERT(obd != NULL);
+ seq_printf(m, "%u\n", obd->obd_num_exports);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_num_exports);
+
+int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{
+ struct obd_type *class = (struct obd_type *) data;
+
+ LASSERT(class != NULL);
+ seq_printf(m, "%d\n", class->typ_refcnt);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_numrefs);
+
+int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
+{
+ int rc = 0;
+
+ LASSERT(obd != NULL);
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ LASSERT(obd->obd_type->typ_procroot != NULL);
+
+ obd->obd_proc_entry = lprocfs_register(obd->obd_name,
+ obd->obd_type->typ_procroot,
+ list, obd);
+ if (IS_ERR(obd->obd_proc_entry)) {
+ rc = PTR_ERR(obd->obd_proc_entry);
+ CERROR("error %d setting up lprocfs for %s\n",
+ rc, obd->obd_name);
+ obd->obd_proc_entry = NULL;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_setup);
+
+int lprocfs_obd_cleanup(struct obd_device *obd)
+{
+ if (!obd)
+ return -EINVAL;
+ if (obd->obd_proc_exports_entry) {
+ /* Should be no exports left */
+ lprocfs_remove(&obd->obd_proc_exports_entry);
+ obd->obd_proc_exports_entry = NULL;
+ }
+ if (obd->obd_proc_entry) {
+ lprocfs_remove(&obd->obd_proc_entry);
+ obd->obd_proc_entry = NULL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_cleanup);
+
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
+{
+ CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat,
+ client_stat->nid_proc, client_stat->nid_stats);
+
+ LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+ "nid %s:count %d\n", libcfs_nid2str(client_stat->nid),
+ atomic_read(&client_stat->nid_exp_ref_count));
+
+ if (client_stat->nid_proc)
+ lprocfs_remove(&client_stat->nid_proc);
+
+ if (client_stat->nid_stats)
+ lprocfs_free_stats(&client_stat->nid_stats);
+
+ if (client_stat->nid_ldlm_stats)
+ lprocfs_free_stats(&client_stat->nid_ldlm_stats);
+
+ OBD_FREE_PTR(client_stat);
+}
+
+void lprocfs_free_per_client_stats(struct obd_device *obd)
+{
+ struct cfs_hash *hash = obd->obd_nid_stats_hash;
+ struct nid_stat *stat;
+
+ /* we need an extra list because hash_exit is called too early */
+ /* no locking is needed because all clients have died */
+ while (!list_empty(&obd->obd_nid_stats)) {
+ stat = list_entry(obd->obd_nid_stats.next,
+ struct nid_stat, nid_list);
+ list_del_init(&stat->nid_list);
+ cfs_hash_del(hash, &stat->nid, &stat->nid_hash);
+ lprocfs_free_client_stats(stat);
+ }
+}
+EXPORT_SYMBOL(lprocfs_free_per_client_stats);
+
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid)
+{
+ struct lprocfs_counter *cntr;
+ unsigned int percpusize;
+ int rc = -ENOMEM;
+ unsigned long flags = 0;
+ int i;
+
+ LASSERT(stats->ls_percpu[cpuid] == NULL);
+ LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0);
+
+ percpusize = lprocfs_stats_counter_size(stats);
+ LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize);
+ if (stats->ls_percpu[cpuid] != NULL) {
+ rc = 0;
+ if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) {
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ spin_lock_irqsave(&stats->ls_lock, flags);
+ else
+ spin_lock(&stats->ls_lock);
+ if (stats->ls_biggest_alloc_num <= cpuid)
+ stats->ls_biggest_alloc_num = cpuid + 1;
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ spin_unlock_irqrestore(&stats->ls_lock, flags);
+ else
+ spin_unlock(&stats->ls_lock);
+ }
+ /* initialize the non-zero fields of the ls_percpu[cpuid] counters */
+ for (i = 0; i < stats->ls_num; ++i) {
+ cntr = lprocfs_stats_counter_get(stats, cpuid, i);
+ cntr->lc_min = LC_MIN_INIT;
+ }
+ }
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_stats_alloc_one);
+
+struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
+ enum lprocfs_stats_flags flags)
+{
+ struct lprocfs_stats *stats;
+ unsigned int num_entry;
+ unsigned int percpusize = 0;
+ int i;
+
+ if (num == 0)
+ return NULL;
+
+ if (lprocfs_no_percpu_stats != 0)
+ flags |= LPROCFS_STATS_FLAG_NOPERCPU;
+
+ if (flags & LPROCFS_STATS_FLAG_NOPERCPU)
+ num_entry = 1;
+ else
+ num_entry = num_possible_cpus();
+
+ /* alloc percpu pointers for all possible cpu slots */
+ LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+ if (stats == NULL)
+ return NULL;
+
+ stats->ls_num = num;
+ stats->ls_flags = flags;
+ spin_lock_init(&stats->ls_lock);
+
+ /* alloc num of counter headers */
+ LIBCFS_ALLOC(stats->ls_cnt_header,
+ stats->ls_num * sizeof(struct lprocfs_counter_header));
+ if (stats->ls_cnt_header == NULL)
+ goto fail;
+
+ if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) {
+ /* contains only one set of counters */
+ percpusize = lprocfs_stats_counter_size(stats);
+ LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize);
+ if (stats->ls_percpu[0] == NULL)
+ goto fail;
+ stats->ls_biggest_alloc_num = 1;
+ } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) {
+ /* alloc all percpu data; currently only obd_memory uses this */
+ for (i = 0; i < num_entry; ++i)
+ if (lprocfs_stats_alloc_one(stats, i) < 0)
+ goto fail;
+ }
+
+ return stats;
+
+fail:
+ lprocfs_free_stats(&stats);
+ return NULL;
+}
+EXPORT_SYMBOL(lprocfs_alloc_stats);
+
+void lprocfs_free_stats(struct lprocfs_stats **statsh)
+{
+ struct lprocfs_stats *stats = *statsh;
+ unsigned int num_entry;
+ unsigned int percpusize;
+ unsigned int i;
+
+ if (stats == NULL || stats->ls_num == 0)
+ return;
+ *statsh = NULL;
+
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
+ num_entry = 1;
+ else
+ num_entry = num_possible_cpus();
+
+ percpusize = lprocfs_stats_counter_size(stats);
+ for (i = 0; i < num_entry; i++)
+ if (stats->ls_percpu[i] != NULL)
+ LIBCFS_FREE(stats->ls_percpu[i], percpusize);
+ if (stats->ls_cnt_header != NULL)
+ LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num *
+ sizeof(struct lprocfs_counter_header));
+ LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+}
+EXPORT_SYMBOL(lprocfs_free_stats);
+
+void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{
+ struct lprocfs_counter *percpu_cntr;
+ int i;
+ int j;
+ unsigned int num_entry;
+ unsigned long flags = 0;
+
+ num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+ for (i = 0; i < num_entry; i++) {
+ if (stats->ls_percpu[i] == NULL)
+ continue;
+ for (j = 0; j < stats->ls_num; j++) {
+ percpu_cntr = lprocfs_stats_counter_get(stats, i, j);
+ percpu_cntr->lc_count = 0;
+ percpu_cntr->lc_min = LC_MIN_INIT;
+ percpu_cntr->lc_max = 0;
+ percpu_cntr->lc_sumsquare = 0;
+ percpu_cntr->lc_sum = 0;
+ if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+ percpu_cntr->lc_sum_irq = 0;
+ }
+ }
+
+ lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_clear_stats);
+
+static ssize_t lprocfs_stats_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct lprocfs_stats *stats = seq->private;
+
+ lprocfs_clear_stats(stats);
+
+ return len;
+}
+
+static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
+{
+ struct lprocfs_stats *stats = p->private;
+
+ return (*pos < stats->ls_num) ? pos : NULL;
+}
+
+static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return lprocfs_stats_seq_start(p, pos);
+}
+
+/* seq file export of one lprocfs counter */
+static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
+{
+ struct lprocfs_stats *stats = p->private;
+ struct lprocfs_counter_header *hdr;
+ struct lprocfs_counter ctr;
+ int idx = *(loff_t *)v;
+
+ if (idx == 0) {
+ struct timeval now;
+ do_gettimeofday(&now);
+ seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
+ "snapshot_time",
+ now.tv_sec, (unsigned long)now.tv_usec);
+ }
+
+ hdr = &stats->ls_cnt_header[idx];
+ lprocfs_stats_collect(stats, idx, &ctr);
+
+ if (ctr.lc_count != 0) {
+ seq_printf(p, "%-25s %lld samples [%s]",
+ hdr->lc_name, ctr.lc_count, hdr->lc_units);
+
+ if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) &&
+ (ctr.lc_count > 0)) {
+ seq_printf(p, " %lld %lld %lld",
+ ctr.lc_min, ctr.lc_max, ctr.lc_sum);
+ if (hdr->lc_config & LPROCFS_CNTR_STDDEV)
+ seq_printf(p, " %lld", ctr.lc_sumsquare);
+ }
+ seq_putc(p, '\n');
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lprocfs_stats_seq_sops = {
+ .start = lprocfs_stats_seq_start,
+ .stop = lprocfs_stats_seq_stop,
+ .next = lprocfs_stats_seq_next,
+ .show = lprocfs_stats_seq_show,
+};
+
+static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc;
+
+ rc = seq_open(file, &lprocfs_stats_seq_sops);
+ if (rc)
+ return rc;
+ seq = file->private_data;
+ seq->private = PDE_DATA(inode);
+ return 0;
+}
+
+struct file_operations lprocfs_stats_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = lprocfs_stats_seq_open,
+ .read = seq_read,
+ .write = lprocfs_stats_seq_write,
+ .llseek = seq_lseek,
+ .release = lprocfs_seq_release,
+};
+
+int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+ struct lprocfs_stats *stats)
+{
+ struct proc_dir_entry *entry;
+ LASSERT(root != NULL);
+
+ entry = proc_create_data(name, 0644, root,
+ &lprocfs_stats_seq_fops, stats);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_register_stats);
+
+void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+ unsigned conf, const char *name, const char *units)
+{
+ struct lprocfs_counter_header *header;
+ struct lprocfs_counter *percpu_cntr;
+ unsigned long flags = 0;
+ unsigned int i;
+ unsigned int num_cpu;
+
+ LASSERT(stats != NULL);
+
+ header = &stats->ls_cnt_header[index];
+ LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n",
+ index, name, units);
+
+ header->lc_config = conf;
+ header->lc_name = name;
+ header->lc_units = units;
+
+ num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+ for (i = 0; i < num_cpu; ++i) {
+ if (stats->ls_percpu[i] == NULL)
+ continue;
+ percpu_cntr = lprocfs_stats_counter_get(stats, i, index);
+ percpu_cntr->lc_count = 0;
+ percpu_cntr->lc_min = LC_MIN_INIT;
+ percpu_cntr->lc_max = 0;
+ percpu_cntr->lc_sumsquare = 0;
+ percpu_cntr->lc_sum = 0;
+ if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ percpu_cntr->lc_sum_irq = 0;
+ }
+ lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_init);
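+
+/*
+ * Illustrative sketch (not part of this change): a minimal stats block with
+ * a single counter could be set up as below; "example_root" is a
+ * hypothetical proc directory.
+ *
+ *	struct lprocfs_stats *st = lprocfs_alloc_stats(1, 0);
+ *
+ *	if (st != NULL) {
+ *		lprocfs_counter_init(st, 0, LPROCFS_CNTR_AVGMINMAX,
+ *				     "example_reqs", "reqs");
+ *		lprocfs_register_stats(example_root, "stats", st);
+ *	}
+ *
+ * Updates land in per-CPU copies (unless LPROCFS_STATS_FLAG_NOPERCPU is set)
+ * and are folded back together by lprocfs_stats_collect() when the seq file
+ * is read.
+ */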
+
+#define LPROCFS_OBD_OP_INIT(base, stats, op) \
+do { \
+ unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < stats->ls_num); \
+ lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \
+} while (0)
+
+void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+}
+EXPORT_SYMBOL(lprocfs_init_ops_stats);
+
+int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
+{
+ struct lprocfs_stats *stats;
+ unsigned int num_stats;
+ int rc, i;
+
+ LASSERT(obd->obd_stats == NULL);
+ LASSERT(obd->obd_proc_entry != NULL);
+ LASSERT(obd->obd_cntr_base == 0);
+
+ num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+ num_private_stats - 1 /* o_owner */;
+ stats = lprocfs_alloc_stats(num_stats, 0);
+ if (stats == NULL)
+ return -ENOMEM;
+
+ lprocfs_init_ops_stats(num_private_stats, stats);
+
+ for (i = num_private_stats; i < num_stats; i++) {
+ /* If this LBUGs, it is likely that an obd
+ * operation was added to struct obd_ops in
+ * <obd.h>, and that the corresponding line item
+ * LPROCFS_OBD_OP_INIT(.., .., opname)
+ * is missing from the list above. */
+ LASSERTF(stats->ls_cnt_header[i].lc_name != NULL,
+ "Missing obd_stat initializer obd_op operation at offset %d.\n",
+ i - num_private_stats);
+ }
+ rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
+ if (rc < 0) {
+ lprocfs_free_stats(&stats);
+ } else {
+ obd->obd_stats = stats;
+ obd->obd_cntr_base = num_private_stats;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
+
+void lprocfs_free_obd_stats(struct obd_device *obd)
+{
+ if (obd->obd_stats)
+ lprocfs_free_stats(&obd->obd_stats);
+}
+EXPORT_SYMBOL(lprocfs_free_obd_stats);
+
+#define LPROCFS_MD_OP_INIT(base, stats, op) \
+do { \
+ unsigned int coffset = base + MD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < stats->ls_num); \
+ lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \
+} while (0)
+
+void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, close);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, create);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, link);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, rename);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, sync);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock);
+}
+EXPORT_SYMBOL(lprocfs_init_mps_stats);
+
+int lprocfs_alloc_md_stats(struct obd_device *obd,
+ unsigned num_private_stats)
+{
+ struct lprocfs_stats *stats;
+ unsigned int num_stats;
+ int rc, i;
+
+ LASSERT(obd->md_stats == NULL);
+ LASSERT(obd->obd_proc_entry != NULL);
+ LASSERT(obd->md_cntr_base == 0);
+
+ num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) +
+ num_private_stats;
+ stats = lprocfs_alloc_stats(num_stats, 0);
+ if (stats == NULL)
+ return -ENOMEM;
+
+ lprocfs_init_mps_stats(num_private_stats, stats);
+
+ for (i = num_private_stats; i < num_stats; i++) {
+ if (stats->ls_cnt_header[i].lc_name == NULL) {
+ CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n",
+ i - num_private_stats);
+ LBUG();
+ }
+ }
+ rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats);
+ if (rc < 0) {
+ lprocfs_free_stats(&stats);
+ } else {
+ obd->md_stats = stats;
+ obd->md_cntr_base = num_private_stats;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_md_stats);
+
+void lprocfs_free_md_stats(struct obd_device *obd)
+{
+ struct lprocfs_stats *stats = obd->md_stats;
+
+ if (stats != NULL) {
+ obd->md_stats = NULL;
+ obd->md_cntr_base = 0;
+ lprocfs_free_stats(&stats);
+ }
+}
+EXPORT_SYMBOL(lprocfs_free_md_stats);
+
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_ENQUEUE - LDLM_FIRST_OPC,
+ 0, "ldlm_enqueue", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_CONVERT - LDLM_FIRST_OPC,
+ 0, "ldlm_convert", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_CANCEL - LDLM_FIRST_OPC,
+ 0, "ldlm_cancel", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_BL_CALLBACK - LDLM_FIRST_OPC,
+ 0, "ldlm_bl_callback", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_CP_CALLBACK - LDLM_FIRST_OPC,
+ 0, "ldlm_cp_callback", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_GL_CALLBACK - LDLM_FIRST_OPC,
+ 0, "ldlm_gl_callback", "reqs");
+}
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
+
+int lprocfs_exp_print_uuid(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
+{
+ struct obd_export *exp = cfs_hash_object(hs, hnode);
+ struct seq_file *m = (struct seq_file *)data;
+
+ if (exp->exp_nid_stats)
+ seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid));
+
+ return 0;
+}
+
+static int
+lproc_exp_uuid_seq_show(struct seq_file *m, void *unused)
+{
+ struct nid_stat *stats = (struct nid_stat *)m->private;
+ struct obd_device *obd = stats->nid_obd;
+
+ cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+ lprocfs_exp_print_uuid, m);
+ return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_uuid);
+
+struct exp_hash_cb_data {
+ struct seq_file *m;
+ bool first;
+};
+
+int lprocfs_exp_print_hash(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *cb_data)
+{
+ struct exp_hash_cb_data *data = (struct exp_hash_cb_data *)cb_data;
+ struct obd_export *exp = cfs_hash_object(hs, hnode);
+
+ if (exp->exp_lock_hash != NULL) {
+ if (data->first) {
+ cfs_hash_debug_header(data->m);
+ data->first = false;
+ }
+ cfs_hash_debug_str(hs, data->m);
+ }
+
+ return 0;
+}
+
+static int
+lproc_exp_hash_seq_show(struct seq_file *m, void *unused)
+{
+ struct nid_stat *stats = (struct nid_stat *)m->private;
+ struct obd_device *obd = stats->nid_obd;
+ struct exp_hash_cb_data cb_data = {
+ .m = m,
+ .first = true
+ };
+
+ cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+ lprocfs_exp_print_hash, &cb_data);
+ return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_hash);
+
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{
+ seq_printf(m, "%s\n",
+ "Write into this file to clear all nid stats and stale nid entries");
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
+
+static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
+{
+ struct nid_stat *stat = obj;
+
+ CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count));
+ if (atomic_read(&stat->nid_exp_ref_count) == 1) {
+ /* object has only hash references. */
+ spin_lock(&stat->nid_obd->obd_nid_lock);
+ list_move(&stat->nid_list, data);
+ spin_unlock(&stat->nid_obd->obd_nid_lock);
+ return 1;
+ }
+ /* we hold a reference to the object - only clear its data */
+ if (stat->nid_stats)
+ lprocfs_clear_stats(stat->nid_stats);
+
+ return 0;
+}
+
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ struct nid_stat *client_stat;
+ LIST_HEAD(free_list);
+
+ cfs_hash_cond_del(obd->obd_nid_stats_hash,
+ lprocfs_nid_stats_clear_write_cb, &free_list);
+
+ while (!list_empty(&free_list)) {
+ client_stat = list_entry(free_list.next, struct nid_stat,
+ nid_list);
+ list_del_init(&client_stat->nid_list);
+ lprocfs_free_client_stats(client_stat);
+ }
+
+ return count;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
+
+int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
+{
+ struct nid_stat *new_stat, *old_stat;
+ struct obd_device *obd = NULL;
+ struct proc_dir_entry *entry;
+ char *buffer = NULL;
+ int rc = 0;
+
+ *newnid = 0;
+
+ if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
+ !exp->exp_obd->obd_nid_stats_hash)
+ return -EINVAL;
+
+ /* Do not test against zero because, as Eric says:
+ * "You may only test a nid against another nid, or LNET_NID_ANY.
+ * Anything else is nonsense." */
+ if (!nid || *nid == LNET_NID_ANY)
+ return 0;
+
+ obd = exp->exp_obd;
+
+ CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
+
+ OBD_ALLOC_PTR(new_stat);
+ if (new_stat == NULL)
+ return -ENOMEM;
+
+ new_stat->nid = *nid;
+ new_stat->nid_obd = exp->exp_obd;
+ /* set the default refcount to 1 to balance obd_disconnect */
+ atomic_set(&new_stat->nid_exp_ref_count, 1);
+
+ old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash,
+ nid, &new_stat->nid_hash);
+ CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+ old_stat, libcfs_nid2str(*nid),
+ atomic_read(&new_stat->nid_exp_ref_count));
+
+ /* We need to release the old stats because lprocfs_exp_cleanup() has not
+ * been called and never will be. */
+ if (exp->exp_nid_stats) {
+ nidstat_putref(exp->exp_nid_stats);
+ exp->exp_nid_stats = NULL;
+ }
+
+ /* Return -EALREADY here so that we know that the /proc
+ * entry has already been created */
+ if (old_stat != new_stat) {
+ exp->exp_nid_stats = old_stat;
+ rc = -EALREADY;
+ goto destroy_new;
+ }
+ /* not found - create */
+ OBD_ALLOC(buffer, LNET_NIDSTR_SIZE);
+ if (buffer == NULL) {
+ rc = -ENOMEM;
+ goto destroy_new;
+ }
+
+ memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE);
+ new_stat->nid_proc = lprocfs_register(buffer,
+ obd->obd_proc_exports_entry,
+ NULL, NULL);
+ OBD_FREE(buffer, LNET_NIDSTR_SIZE);
+
+ if (IS_ERR(new_stat->nid_proc)) {
+ CERROR("Error making export directory for nid %s\n",
+ libcfs_nid2str(*nid));
+ rc = PTR_ERR(new_stat->nid_proc);
+ new_stat->nid_proc = NULL;
+ goto destroy_new_ns;
+ }
+
+ entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+ new_stat, &lproc_exp_uuid_fops);
+ if (IS_ERR(entry)) {
+ CWARN("Error adding the NID stats file\n");
+ rc = PTR_ERR(entry);
+ goto destroy_new_ns;
+ }
+
+ entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+ new_stat, &lproc_exp_hash_fops);
+ if (IS_ERR(entry)) {
+ CWARN("Error adding the hash file\n");
+ rc = PTR_ERR(entry);
+ goto destroy_new_ns;
+ }
+
+ exp->exp_nid_stats = new_stat;
+ *newnid = 1;
+ /* protect against concurrent list additions; no locking is needed on destroy */
+ spin_lock(&obd->obd_nid_lock);
+ list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+ spin_unlock(&obd->obd_nid_lock);
+
+ return rc;
+
+destroy_new_ns:
+ if (new_stat->nid_proc != NULL)
+ lprocfs_remove(&new_stat->nid_proc);
+ cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+
+destroy_new:
+ nidstat_putref(new_stat);
+ OBD_FREE_PTR(new_stat);
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_exp_setup);
+
+int lprocfs_exp_cleanup(struct obd_export *exp)
+{
+ struct nid_stat *stat = exp->exp_nid_stats;
+
+ if (!stat || !exp->exp_obd)
+ return 0;
+
+ nidstat_putref(exp->exp_nid_stats);
+ exp->exp_nid_stats = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_exp_cleanup);
+
+__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+ struct lprocfs_counter_header *header,
+ enum lprocfs_stats_flags flags,
+ enum lprocfs_fields_flags field)
+{
+ __s64 ret = 0;
+
+ if (lc == NULL || header == NULL)
+ return 0;
+
+ switch (field) {
+ case LPROCFS_FIELDS_FLAGS_CONFIG:
+ ret = header->lc_config;
+ break;
+ case LPROCFS_FIELDS_FLAGS_SUM:
+ ret = lc->lc_sum;
+ if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+ ret += lc->lc_sum_irq;
+ break;
+ case LPROCFS_FIELDS_FLAGS_MIN:
+ ret = lc->lc_min;
+ break;
+ case LPROCFS_FIELDS_FLAGS_MAX:
+ ret = lc->lc_max;
+ break;
+ case LPROCFS_FIELDS_FLAGS_AVG:
+ ret = (lc->lc_max - lc->lc_min) / 2;
+ break;
+ case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
+ ret = lc->lc_sumsquare;
+ break;
+ case LPROCFS_FIELDS_FLAGS_COUNT:
+ ret = lc->lc_count;
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(lprocfs_read_helper);
+
+int lprocfs_write_helper(const char __user *buffer, unsigned long count,
+ int *val)
+{
+ return lprocfs_write_frac_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_helper);
+
+int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult)
+{
+ long decimal_val, frac_val;
+
+ decimal_val = val / mult;
+ seq_printf(m, "%ld", decimal_val);
+ frac_val = val % mult;
+
+ if (frac_val > 0) {
+ frac_val *= 100;
+ frac_val /= mult;
+ }
+ if (frac_val > 0) {
+ /* Three cases: x0, xx, 0x */
+ if ((frac_val % 10) != 0)
+ seq_printf(m, ".%ld", frac_val);
+ else
+ seq_printf(m, ".%ld", frac_val / 10);
+ }
+
+ seq_printf(m, "\n");
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_seq_read_frac_helper);
+
+int lprocfs_write_u64_helper(const char __user *buffer, unsigned long count,
+ __u64 *val)
+{
+ return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_u64_helper);
+
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+ __u64 *val, int mult)
+{
+ char kernbuf[22], *end, *pbuf;
+ __u64 whole, frac = 0, units;
+ unsigned frac_d = 1;
+ int sign = 1;
+
+ if (count > (sizeof(kernbuf) - 1))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = '\0';
+ pbuf = kernbuf;
+ if (*pbuf == '-') {
+ sign = -1;
+ pbuf++;
+ }
+
+ whole = simple_strtoull(pbuf, &end, 10);
+ if (pbuf == end)
+ return -EINVAL;
+
+ if (*end == '.') {
+ int i;
+ pbuf = end + 1;
+
+ /* need to limit frac_d to a __u32 */
+ if (strlen(pbuf) > 10)
+ pbuf[10] = '\0';
+
+ frac = simple_strtoull(pbuf, &end, 10);
+ /* count decimal places */
+ for (i = 0; i < (end - pbuf); i++)
+ frac_d *= 10;
+ }
+
+ units = 1;
+ switch (tolower(*end)) {
+ case 'p':
+ units <<= 10;
+ case 't':
+ units <<= 10;
+ case 'g':
+ units <<= 10;
+ case 'm':
+ units <<= 10;
+ case 'k':
+ units <<= 10;
+ }
+ /* Specified units override the multiplier */
+ if (units > 1)
+ mult = units;
+
+ frac *= mult;
+ do_div(frac, frac_d);
+ *val = sign * (whole * mult + frac);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
+
+static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+ size_t l2;
+
+ l2 = strlen(s2);
+ if (!l2)
+ return (char *)s1;
+ while (len >= l2) {
+ len--;
+ if (!memcmp(s1, s2, l2))
+ return (char *)s1;
+ s1++;
+ }
+ return NULL;
+}
+
+/**
+ * Find the string \a name in the input \a buffer, and return a pointer to the
+ * value immediately following \a name, reducing \a count appropriately.
+ * If \a name is not found the original \a buffer is returned.
+ */
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+ size_t *count)
+{
+ char *val;
+ size_t buflen = *count;
+
+ /* there is no strnstr() in rhel5 and ubuntu kernels */
+ val = lprocfs_strnstr(buffer, name, buflen);
+ if (val == NULL)
+ return (char *)buffer;
+
+ val += strlen(name); /* skip prefix */
+ while (val < buffer + buflen && isspace(*val)) /* skip separator */
+ val++;
+
+ *count = 0;
+ while (val < buffer + buflen && isalnum(*val)) {
+ ++*count;
+ ++val;
+ }
+
+ return val - *count;
+}
+EXPORT_SYMBOL(lprocfs_find_named_value);
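+
+/*
+ * Worked example (illustrative only): with buffer = "max_pages 32\n" and
+ * name = "max_pages", lprocfs_find_named_value() returns a pointer to "32"
+ * and sets *count to 2; if the name is not found, the original buffer is
+ * returned and *count is left unchanged.
+ */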
+
+int lprocfs_seq_create(struct proc_dir_entry *parent,
+ const char *name,
+ umode_t mode,
+ const struct file_operations *seq_fops,
+ void *data)
+{
+ struct proc_dir_entry *entry;
+
+ /* Disallow secretly (un)writable entries. */
+ LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0));
+ entry = proc_create_data(name, mode, parent, seq_fops, data);
+
+ if (entry == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_seq_create);
+
+int lprocfs_obd_seq_create(struct obd_device *dev,
+ const char *name,
+ umode_t mode,
+ const struct file_operations *seq_fops,
+ void *data)
+{
+ return lprocfs_seq_create(dev->obd_proc_entry, name,
+ mode, seq_fops, data);
+}
+EXPORT_SYMBOL(lprocfs_obd_seq_create);
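+
+/*
+ * Illustrative sketch (not part of this change): a read-only seq file can be
+ * hung off an obd's proc directory in one call; "example_import_fops" is a
+ * hypothetical read-only file_operations built with LPROC_SEQ_FOPS_RO().
+ *
+ *	rc = lprocfs_obd_seq_create(dev, "import", 0444,
+ *				    &example_import_fops, dev);
+ *
+ * The mode must agree with the fops: the LASSERT in lprocfs_seq_create()
+ * rejects entries whose write method and write permission bits disagree.
+ */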
+
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{
+ if (value >= OBD_HIST_MAX)
+ value = OBD_HIST_MAX - 1;
+
+ spin_lock(&oh->oh_lock);
+ oh->oh_buckets[value]++;
+ spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally);
+
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{
+ unsigned int val;
+
+ for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++)
+ ;
+
+ lprocfs_oh_tally(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2);
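+
+/*
+ * Worked example (illustrative only): lprocfs_oh_tally_log2(oh, 4096) finds
+ * the smallest val with (1 << val) >= 4096, i.e. val = 12, and increments
+ * oh->oh_buckets[12]; values past the last bucket are clamped to
+ * OBD_HIST_MAX - 1 by lprocfs_oh_tally().
+ */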
+
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{
+ unsigned long ret = 0;
+ int i;
+
+ for (i = 0; i < OBD_HIST_MAX; i++)
+ ret += oh->oh_buckets[i];
+ return ret;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum);
+
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{
+ spin_lock(&oh->oh_lock);
+ memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets));
+ spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear);
+
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data)
+{
+ struct obd_device *dev = data;
+ struct client_obd *cli = &dev->u.cli;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
new file mode 100644
index 000000000..20c077995
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -0,0 +1,2192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_object.c
+ *
+ * Lustre Object.
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+# include <linux/module.h>
+
+/* hash_long() */
+#include "../../include/linux/libcfs/libcfs_hash.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_disk.h"
+#include "../include/lustre_fid.h"
+#include "../include/lu_object.h"
+#include "../include/lu_ref.h"
+#include <linux/list.h>
+
+static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+
+/**
+ * Decrease the reference counter on an object. If the last reference is
+ * released, return the object to the cache, unless lu_object_is_dying(o)
+ * holds; in that case, free the object immediately.
+ */
+void lu_object_put(const struct lu_env *env, struct lu_object *o)
+{
+ struct lu_site_bkt_data *bkt;
+ struct lu_object_header *top;
+ struct lu_site *site;
+ struct lu_object *orig;
+ struct cfs_hash_bd bd;
+ const struct lu_fid *fid;
+
+ top = o->lo_header;
+ site = o->lo_dev->ld_site;
+ orig = o;
+
+ /*
+ * Until fids-on-OST is fully implemented, anonymous objects are
+ * possible in OSP. Such an object isn't listed in the site, so we
+ * should not remove it from the site.
+ */
+ fid = lu_object_fid(o);
+ if (fid_is_zero(fid)) {
+ LASSERT(top->loh_hash.next == NULL
+ && top->loh_hash.pprev == NULL);
+ LASSERT(list_empty(&top->loh_lru));
+ if (!atomic_dec_and_test(&top->loh_ref))
+ return;
+ list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_release != NULL)
+ o->lo_ops->loo_object_release(env, o);
+ }
+ lu_object_free(env, orig);
+ return;
+ }
+
+ cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
+ bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+
+ if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) {
+ if (lu_object_is_dying(top)) {
+
+ /*
+ * somebody may be waiting for this, currently only
+ * used for cl_object, see cl_object_put_last().
+ */
+ wake_up_all(&bkt->lsb_marche_funebre);
+ }
+ return;
+ }
+
+ LASSERT(bkt->lsb_busy > 0);
+ bkt->lsb_busy--;
+ /*
+ * When last reference is released, iterate over object
+ * layers, and notify them that object is no longer busy.
+ */
+ list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_release != NULL)
+ o->lo_ops->loo_object_release(env, o);
+ }
+
+ if (!lu_object_is_dying(top)) {
+ LASSERT(list_empty(&top->loh_lru));
+ list_add_tail(&top->loh_lru, &bkt->lsb_lru);
+ cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+ return;
+ }
+
+ /*
+ * If the object is dying (will not be cached), remove it
+ * from the hash table and LRU.
+ *
+ * This is done with the hash table and LRU lists locked. As the only
+ * way to acquire the first reference to a previously unreferenced
+ * object is through hash-table lookup (lu_object_find())
+ * or LRU scanning (lu_site_purge()), both done under the hash-table
+ * and LRU lock, no race with a concurrent object lookup is possible
+ * and we can safely destroy the object below.
+ */
+ if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+ cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+ cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+ /*
+ * The object was already removed from the hash and LRU above, so we
+ * can kill it.
+ */
+ lu_object_free(env, orig);
+}
+EXPORT_SYMBOL(lu_object_put);
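+
+/*
+ * Illustrative sketch (not part of this change): the usual pattern is to pair
+ * a lookup (for example lu_object_find(), defined later in this file) with a
+ * matching lu_object_put() once the caller is done, so the object can drop
+ * back onto the site LRU:
+ *
+ *	o = lu_object_find(env, dev, fid, conf);
+ *	if (!IS_ERR(o)) {
+ *		... use the object ...
+ *		lu_object_put(env, o);
+ *	}
+ */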
+
+/**
+ * Put object and don't keep in cache. This is temporary solution for
+ * multi-site objects when its layering is not constant.
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
+ return lu_object_put(env, o);
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
+ * Kill the object and take it out of LRU cache.
+ * Currently used by client code for layout change.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+ struct lu_object_header *top;
+
+ top = o->lo_header;
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
+ if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
+ struct cfs_hash *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+ struct cfs_hash_bd bd;
+
+ cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
+ list_del_init(&top->loh_lru);
+ cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
+ cfs_hash_bd_unlock(obj_hash, &bd, 1);
+ }
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
+ * Allocate new object.
+ *
+ * This follows the object creation protocol described in the comment within
+ * the struct lu_device_operations definition.
+ */
+static struct lu_object *lu_object_alloc(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ struct lu_object *scan;
+ struct lu_object *top;
+ struct list_head *layers;
+ unsigned int init_mask = 0;
+ unsigned int init_flag;
+ int clean;
+ int result;
+
+ /*
+ * Create top-level object slice. This will also create
+ * lu_object_header.
+ */
+ top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+ if (top == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (IS_ERR(top))
+ return top;
+ /*
+ * This is the only place where object fid is assigned. It's constant
+ * after this point.
+ */
+ top->lo_header->loh_fid = *f;
+ layers = &top->lo_header->loh_layers;
+
+ do {
+ /*
+ * Call ->loo_object_init() repeatedly, until no more new
+ * object slices are created.
+ */
+ clean = 1;
+ init_flag = 1;
+ list_for_each_entry(scan, layers, lo_linkage) {
+ if (init_mask & init_flag)
+ goto next;
+ clean = 0;
+ scan->lo_header = top->lo_header;
+ result = scan->lo_ops->loo_object_init(env, scan, conf);
+ if (result != 0) {
+ lu_object_free(env, top);
+ return ERR_PTR(result);
+ }
+ init_mask |= init_flag;
+next:
+ init_flag <<= 1;
+ }
+ } while (!clean);
+
+ list_for_each_entry_reverse(scan, layers, lo_linkage) {
+ if (scan->lo_ops->loo_object_start != NULL) {
+ result = scan->lo_ops->loo_object_start(env, scan);
+ if (result != 0) {
+ lu_object_free(env, top);
+ return ERR_PTR(result);
+ }
+ }
+ }
+
+ lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED);
+ return top;
+}
+
+/**
+ * Free an object.
+ */
+static void lu_object_free(const struct lu_env *env, struct lu_object *o)
+{
+ struct lu_site_bkt_data *bkt;
+ struct lu_site *site;
+ struct lu_object *scan;
+ struct list_head *layers;
+ struct list_head splice;
+
+ site = o->lo_dev->ld_site;
+ layers = &o->lo_header->loh_layers;
+ bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid);
+ /*
+ * First call ->loo_object_delete() method to release all resources.
+ */
+ list_for_each_entry_reverse(scan, layers, lo_linkage) {
+ if (scan->lo_ops->loo_object_delete != NULL)
+ scan->lo_ops->loo_object_delete(env, scan);
+ }
+
+ /*
+ * Then, splice object layers into stand-alone list, and call
+ * ->loo_object_free() on all layers to free memory. Splice is
+ * necessary, because lu_object_header is freed together with the
+ * top-level slice.
+ */
+ INIT_LIST_HEAD(&splice);
+ list_splice_init(layers, &splice);
+ while (!list_empty(&splice)) {
+ /*
+ * Free layers in bottom-to-top order, so that object header
+ * lives as long as possible and ->loo_object_free() methods
+ * can look at its contents.
+ */
+ o = container_of0(splice.prev, struct lu_object, lo_linkage);
+ list_del_init(&o->lo_linkage);
+ LASSERT(o->lo_ops->loo_object_free != NULL);
+ o->lo_ops->loo_object_free(env, o);
+ }
+
+ if (waitqueue_active(&bkt->lsb_marche_funebre))
+ wake_up_all(&bkt->lsb_marche_funebre);
+}
+
+/**
+ * Free \a nr objects from the cold end of the site LRU list.
+ */
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
+{
+ struct lu_object_header *h;
+ struct lu_object_header *temp;
+ struct lu_site_bkt_data *bkt;
+ struct cfs_hash_bd bd;
+ struct cfs_hash_bd bd2;
+ struct list_head dispose;
+ int did_sth;
+ int start;
+ int count;
+ int bnr;
+ int i;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU))
+ return 0;
+
+ INIT_LIST_HEAD(&dispose);
+ /*
+ * Under LRU list lock, scan LRU list and move unreferenced objects to
+ * the dispose list, removing them from LRU and hash table.
+ */
+ start = s->ls_purge_start;
+ bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1;
+ again:
+ did_sth = 0;
+ cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+ if (i < start)
+ continue;
+ count = bnr;
+ cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1);
+ bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+
+ list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) {
+ LASSERT(atomic_read(&h->loh_ref) == 0);
+
+ cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2);
+ LASSERT(bd.bd_bucket == bd2.bd_bucket);
+
+ cfs_hash_bd_del_locked(s->ls_obj_hash,
+ &bd2, &h->loh_hash);
+ list_move(&h->loh_lru, &dispose);
+ if (did_sth == 0)
+ did_sth = 1;
+
+ if (nr != ~0 && --nr == 0)
+ break;
+
+ if (count > 0 && --count == 0)
+ break;
+
+ }
+ cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1);
+ cond_resched();
+ /*
+ * Free everything on the dispose list. This is safe against
+ * races due to the reasons described in lu_object_put().
+ */
+ while (!list_empty(&dispose)) {
+ h = container_of0(dispose.next,
+ struct lu_object_header, loh_lru);
+ list_del_init(&h->loh_lru);
+ lu_object_free(env, lu_object_top(h));
+ lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED);
+ }
+
+ if (nr == 0)
+ break;
+ }
+
+ if (nr != 0 && did_sth && start != 0) {
+ start = 0; /* restart from the first bucket */
+ goto again;
+ }
+ /* race on s->ls_purge_start, but nobody cares */
+ s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash);
+
+ return nr;
+}
+EXPORT_SYMBOL(lu_site_purge);
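+
+/*
+ * Usage sketch (illustrative, not part of the original import): a caller
+ * holding an initialized lu_env and a lu_site can shrink the object cache
+ * by purging a bounded number of unreferenced objects, or pass ~0 to drop
+ * everything that is not busy, as lu_stack_fini() below does:
+ *
+ *     lu_site_purge(env, site, 128);     purge up to 128 cold objects
+ *     lu_site_purge(env, site, ~0);      purge the whole cache
+ *
+ * As written, the function returns the portion of \a nr that it did not
+ * manage to purge.
+ */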
+
+/*
+ * Object printing.
+ *
+ * The code below has to jump through certain hoops to output an object
+ * description into a libcfs_debug_msg-based log. The problem is that
+ * lu_object_print() composes the object description from strings that are
+ * parts of _lines_ of output (i.e., strings that are not terminated by a
+ * newline). This doesn't fit very well into the libcfs_debug_msg() interface,
+ * which assumes that each message supplied to it is a self-contained output
+ * line.
+ *
+ * To work around this, strings are collected in a temporary buffer
+ * (implemented as a value of the lu_global_key key), until a terminating
+ * newline character is detected.
+ *
+ */
+
+enum {
+ /**
+ * Maximal line size.
+ *
+ * XXX overflow is not handled correctly.
+ */
+ LU_CDEBUG_LINE = 512
+};
+
+struct lu_cdebug_data {
+ /**
+ * Temporary buffer.
+ */
+ char lck_area[LU_CDEBUG_LINE];
+};
+
+/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */
+LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data);
+
+/**
+ * Key, holding temporary buffer. This key is registered very early by
+ * lu_global_init().
+ */
+struct lu_context_key lu_global_key = {
+ .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD |
+ LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL,
+ .lct_init = lu_global_key_init,
+ .lct_fini = lu_global_key_fini
+};
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+ void *cookie, const char *format, ...)
+{
+ struct libcfs_debug_msg_data *msgdata = cookie;
+ struct lu_cdebug_data *key;
+ int used;
+ int complete;
+ va_list args;
+
+ va_start(args, format);
+
+ key = lu_context_key_get(&env->le_ctx, &lu_global_key);
+ LASSERT(key != NULL);
+
+ used = strlen(key->lck_area);
+ complete = format[strlen(format) - 1] == '\n';
+ /*
+ * Append new chunk to the buffer.
+ */
+ vsnprintf(key->lck_area + used,
+ ARRAY_SIZE(key->lck_area) - used, format, args);
+ if (complete) {
+ if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys))
+ libcfs_debug_msg(msgdata, "%s", key->lck_area);
+ key->lck_area[0] = 0;
+ }
+ va_end(args);
+ return 0;
+}
+EXPORT_SYMBOL(lu_cdebug_printer);
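+
+/*
+ * Illustrative note (not part of the original import): successive calls
+ * accumulate text in the per-context buffer until a format string ends in
+ * '\n', at which point the assembled line is emitted as one debug message:
+ *
+ *     (*printer)(env, cookie, "header@%p", hdr);     buffered
+ *     (*printer)(env, cookie, " lru\n");             flushed as one line
+ */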
+
+/**
+ * Print object header.
+ */
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct lu_object_header *hdr)
+{
+ (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
+ hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
+ PFID(&hdr->loh_fid),
+ hlist_unhashed(&hdr->loh_hash) ? "" : " hash",
+ list_empty((struct list_head *)&hdr->loh_lru) ?
+ "" : " lru",
+ hdr->loh_attr & LOHA_EXISTS ? " exist":"");
+}
+EXPORT_SYMBOL(lu_object_header_print);
+
+/**
+ * Print a human-readable representation of \a o to the \a printer.
+ */
+void lu_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct lu_object *o)
+{
+ static const char ruler[] = "........................................";
+ struct lu_object_header *top;
+ int depth = 4;
+
+ top = o->lo_header;
+ lu_object_header_print(env, cookie, printer, top);
+ (*printer)(env, cookie, "{\n");
+
+ list_for_each_entry(o, &top->loh_layers, lo_linkage) {
+ /*
+ * print `.' \a depth times followed by type name and address
+ */
+ (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler,
+ o->lo_dev->ld_type->ldt_name, o);
+
+ if (o->lo_ops->loo_object_print != NULL)
+ (*o->lo_ops->loo_object_print)(env, cookie, printer, o);
+
+ (*printer)(env, cookie, "\n");
+ }
+
+ (*printer)(env, cookie, "} header@%p\n", top);
+}
+EXPORT_SYMBOL(lu_object_print);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o)
+{
+ struct lu_object_header *top;
+
+ top = o->lo_header;
+ list_for_each_entry(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_invariant != NULL &&
+ !o->lo_ops->loo_object_invariant(o))
+ return 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(lu_object_invariant);
+
+static struct lu_object *htable_lookup(struct lu_site *s,
+ struct cfs_hash_bd *bd,
+ const struct lu_fid *f,
+ wait_queue_t *waiter,
+ __u64 *version)
+{
+ struct lu_site_bkt_data *bkt;
+ struct lu_object_header *h;
+ struct hlist_node *hnode;
+ __u64 ver = cfs_hash_bd_version_get(bd);
+
+ if (*version == ver)
+ return ERR_PTR(-ENOENT);
+
+ *version = ver;
+ bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
+ /* cfs_hash_bd_peek_locked is a somewhat "internal" function
+ * of cfs_hash; it doesn't add a refcount on the object. */
+ hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
+ if (hnode == NULL) {
+ lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
+ return ERR_PTR(-ENOENT);
+ }
+
+ h = container_of0(hnode, struct lu_object_header, loh_hash);
+ if (likely(!lu_object_is_dying(h))) {
+ cfs_hash_get(s->ls_obj_hash, hnode);
+ lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
+ list_del_init(&h->loh_lru);
+ return lu_object_top(h);
+ }
+
+ /*
+ * Lookup found an object being destroyed; this object cannot be
+ * returned (to assure that references to dying objects are eventually
+ * drained), and moreover, lookup has to wait until the object is freed.
+ */
+
+ init_waitqueue_entry(waiter, current);
+ add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * Search cache for an object with the fid \a f. If such object is found,
+ * return it. Otherwise, create new object, insert it into cache and return
+ * it. In any case, additional reference is acquired on the returned object.
+ */
+struct lu_object *lu_object_find(const struct lu_env *env,
+ struct lu_device *dev, const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf);
+}
+EXPORT_SYMBOL(lu_object_find);
+
+static struct lu_object *lu_object_new(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ struct lu_object *o;
+ struct cfs_hash *hs;
+ struct cfs_hash_bd bd;
+ struct lu_site_bkt_data *bkt;
+
+ o = lu_object_alloc(env, dev, f, conf);
+ if (unlikely(IS_ERR(o)))
+ return o;
+
+ hs = dev->ld_site->ls_obj_hash;
+ cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+ bkt = cfs_hash_bd_extra_get(hs, &bd);
+ cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+ bkt->lsb_busy++;
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ return o;
+}
+
+/**
+ * Core logic of lu_object_find*() functions.
+ */
+static struct lu_object *lu_object_find_try(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf,
+ wait_queue_t *waiter)
+{
+ struct lu_object *o;
+ struct lu_object *shadow;
+ struct lu_site *s;
+ struct cfs_hash *hs;
+ struct cfs_hash_bd bd;
+ __u64 version = 0;
+
+ /*
+ * This uses standard index maintenance protocol:
+ *
+ * - search index under lock, and return object if found;
+ * - otherwise, unlock index, allocate new object;
+ * - lock index and search again;
+ * - if nothing is found (usual case), insert newly created
+ * object into index;
+ * - otherwise (race: other thread inserted object), free
+ * object just allocated.
+ * - unlock index;
+ * - return object.
+ *
+ * For "LOC_F_NEW" case, we are sure the object is new established.
+ * It is unnecessary to perform lookup-alloc-lookup-insert, instead,
+ * just alloc and insert directly.
+ *
+ * If dying object is found during index search, add @waiter to the
+ * site wait-queue and return ERR_PTR(-EAGAIN).
+ */
+ if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+ return lu_object_new(env, dev, f, conf);
+
+ s = dev->ld_site;
+ hs = s->ls_obj_hash;
+ cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+ o = htable_lookup(s, &bd, f, waiter, &version);
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT)
+ return o;
+
+ /*
+ * Allocate new object. This may result in rather complicated
+ * operations, including fld queries, inode loading, etc.
+ */
+ o = lu_object_alloc(env, dev, f, conf);
+ if (unlikely(IS_ERR(o)))
+ return o;
+
+ LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+ cfs_hash_bd_lock(hs, &bd, 1);
+
+ shadow = htable_lookup(s, &bd, f, waiter, &version);
+ if (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) {
+ struct lu_site_bkt_data *bkt;
+
+ bkt = cfs_hash_bd_extra_get(hs, &bd);
+ cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+ bkt->lsb_busy++;
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ return o;
+ }
+
+ lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE);
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ lu_object_free(env, o);
+ return shadow;
+}
+
+/**
+ * Much like lu_object_find(), but the top-level device of the object is
+ * specifically \a dev rather than the top-level device of the site. This
+ * interface allows objects of different "stacking" to be created within the
+ * same site.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ struct lu_site_bkt_data *bkt;
+ struct lu_object *obj;
+ wait_queue_t wait;
+
+ while (1) {
+ obj = lu_object_find_try(env, dev, f, conf, &wait);
+ if (obj != ERR_PTR(-EAGAIN))
+ return obj;
+ /*
+ * lu_object_find_try() already added waiter into the
+ * wait queue.
+ */
+ schedule();
+ bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f);
+ remove_wait_queue(&bkt->lsb_marche_funebre, &wait);
+ }
+}
+EXPORT_SYMBOL(lu_object_find_at);
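+
+/*
+ * Typical lookup/release pattern (illustrative sketch, not part of the
+ * original import; "fid" and "conf" are assumed to come from the caller):
+ *
+ *     struct lu_object *o;
+ *
+ *     o = lu_object_find(env, dev, fid, conf);
+ *     if (IS_ERR(o))
+ *             return PTR_ERR(o);
+ *     ... use the object ...
+ *     lu_object_put(env, o);
+ */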
+
+/**
+ * Find object with given fid, and return its slice belonging to given device.
+ */
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ struct lu_object *top;
+ struct lu_object *obj;
+
+ top = lu_object_find(env, dev, f, conf);
+ if (!IS_ERR(top)) {
+ obj = lu_object_locate(top->lo_header, dev->ld_type);
+ if (obj == NULL)
+ lu_object_put(env, top);
+ } else
+ obj = top;
+ return obj;
+}
+EXPORT_SYMBOL(lu_object_find_slice);
+
+/**
+ * Global list of all device types.
+ */
+static LIST_HEAD(lu_device_types);
+
+int lu_device_type_init(struct lu_device_type *ldt)
+{
+ int result = 0;
+
+ INIT_LIST_HEAD(&ldt->ldt_linkage);
+ if (ldt->ldt_ops->ldto_init)
+ result = ldt->ldt_ops->ldto_init(ldt);
+ if (result == 0)
+ list_add(&ldt->ldt_linkage, &lu_device_types);
+ return result;
+}
+EXPORT_SYMBOL(lu_device_type_init);
+
+void lu_device_type_fini(struct lu_device_type *ldt)
+{
+ list_del_init(&ldt->ldt_linkage);
+ if (ldt->ldt_ops->ldto_fini)
+ ldt->ldt_ops->ldto_fini(ldt);
+}
+EXPORT_SYMBOL(lu_device_type_fini);
+
+void lu_types_stop(void)
+{
+ struct lu_device_type *ldt;
+
+ list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
+ if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop)
+ ldt->ldt_ops->ldto_stop(ldt);
+ }
+}
+EXPORT_SYMBOL(lu_types_stop);
+
+/**
+ * Global list of all sites on this node
+ */
+static LIST_HEAD(lu_sites);
+static DEFINE_MUTEX(lu_sites_guard);
+
+/**
+ * Global environment used by site shrinker.
+ */
+static struct lu_env lu_shrink_env;
+
+struct lu_site_print_arg {
+ struct lu_env *lsp_env;
+ void *lsp_cookie;
+ lu_printer_t lsp_printer;
+};
+
+static int
+lu_site_obj_print(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
+{
+ struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data;
+ struct lu_object_header *h;
+
+ h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+ if (!list_empty(&h->loh_layers)) {
+ const struct lu_object *o;
+
+ o = lu_object_top(h);
+ lu_object_print(arg->lsp_env, arg->lsp_cookie,
+ arg->lsp_printer, o);
+ } else {
+ lu_object_header_print(arg->lsp_env, arg->lsp_cookie,
+ arg->lsp_printer, h);
+ }
+ return 0;
+}
+
+/**
+ * Print all objects in \a s.
+ */
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+ lu_printer_t printer)
+{
+ struct lu_site_print_arg arg = {
+ .lsp_env = (struct lu_env *)env,
+ .lsp_cookie = cookie,
+ .lsp_printer = printer,
+ };
+
+ cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg);
+}
+EXPORT_SYMBOL(lu_site_print);
+
+enum {
+ LU_CACHE_PERCENT_MAX = 50,
+ LU_CACHE_PERCENT_DEFAULT = 20
+};
+
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+module_param(lu_cache_percent, int, 0644);
+MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache");
+
+/**
+ * Return desired hash table order.
+ */
+static int lu_htable_order(void)
+{
+ unsigned long cache_size;
+ int bits;
+
+ /*
+ * Calculate hash table size, assuming that we want reasonable
+ * performance when 20% of total memory is occupied by cache of
+ * lu_objects.
+ *
+ * The size of an lu_object is (arbitrarily) taken as 1K (together with the inode).
+ */
+ cache_size = totalram_pages;
+
+#if BITS_PER_LONG == 32
+ /* limit hashtable size for lowmem systems to low RAM */
+ if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT))
+ cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4;
+#endif
+
+ /* clear off unreasonable cache setting. */
+ if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) {
+ CWARN("obdclass: invalid lu_cache_percent: %u, it must be in the range of (0, %u]. Will use default value: %u.\n",
+ lu_cache_percent, LU_CACHE_PERCENT_MAX,
+ LU_CACHE_PERCENT_DEFAULT);
+
+ lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+ }
+ cache_size = cache_size / 100 * lu_cache_percent *
+ (PAGE_CACHE_SIZE / 1024);
+
+ for (bits = 1; (1 << bits) < cache_size; ++bits)
+ ;
+ return bits;
+}
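+
+/*
+ * Worked example (illustrative, assuming 4 GB of RAM, 4 KB pages and the
+ * default lu_cache_percent of 20): totalram_pages = 1048576, so
+ * cache_size = 1048576 / 100 * 20 * (4096 / 1024) = 838800 "1K objects",
+ * and the smallest bits with (1 << bits) >= 838800 is 20. lu_site_init()
+ * below then clamps the result into [LU_SITE_BITS_MIN, LU_SITE_BITS_MAX].
+ */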
+
+static unsigned lu_obj_hop_hash(struct cfs_hash *hs,
+ const void *key, unsigned mask)
+{
+ struct lu_fid *fid = (struct lu_fid *)key;
+ __u32 hash;
+
+ hash = fid_flatten32(fid);
+ hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+ hash = hash_long(hash, hs->hs_bkt_bits);
+
+ /* give me another random factor */
+ hash -= hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3);
+
+ hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+ hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1);
+
+ return hash & mask;
+}
+
+static void *lu_obj_hop_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct lu_object_header, loh_hash);
+}
+
+static void *lu_obj_hop_key(struct hlist_node *hnode)
+{
+ struct lu_object_header *h;
+
+ h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+ return &h->loh_fid;
+}
+
+static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct lu_object_header *h;
+
+ h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+ return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key);
+}
+
+static void lu_obj_hop_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct lu_object_header *h;
+
+ h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+ if (atomic_add_return(1, &h->loh_ref) == 1) {
+ struct lu_site_bkt_data *bkt;
+ struct cfs_hash_bd bd;
+
+ cfs_hash_bd_get(hs, &h->loh_fid, &bd);
+ bkt = cfs_hash_bd_extra_get(hs, &bd);
+ bkt->lsb_busy++;
+ }
+}
+
+static void lu_obj_hop_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ LBUG(); /* we should never be called here */
+}
+
+cfs_hash_ops_t lu_site_hash_ops = {
+ .hs_hash = lu_obj_hop_hash,
+ .hs_key = lu_obj_hop_key,
+ .hs_keycmp = lu_obj_hop_keycmp,
+ .hs_object = lu_obj_hop_object,
+ .hs_get = lu_obj_hop_get,
+ .hs_put_locked = lu_obj_hop_put_locked,
+};
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
+{
+ spin_lock(&s->ls_ld_lock);
+ if (list_empty(&d->ld_linkage))
+ list_add(&d->ld_linkage, &s->ls_ld_linkage);
+ spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_add_linkage);
+
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
+{
+ spin_lock(&s->ls_ld_lock);
+ list_del_init(&d->ld_linkage);
+ spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_del_linkage);
+
+/**
+ * Initialize site \a s, with \a d as the top level device.
+ */
+#define LU_SITE_BITS_MIN 12
+#define LU_SITE_BITS_MAX 24
+/**
+ * 256 buckets in total; we don't want too many buckets because:
+ * - they consume too much memory
+ * - we want to avoid unbalanced LRU lists
+ */
+#define LU_SITE_BKT_BITS 8
+
+int lu_site_init(struct lu_site *s, struct lu_device *top)
+{
+ struct lu_site_bkt_data *bkt;
+ struct cfs_hash_bd bd;
+ char name[16];
+ int bits;
+ int i;
+
+ memset(s, 0, sizeof(*s));
+ bits = lu_htable_order();
+ snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name);
+ for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+ bits >= LU_SITE_BITS_MIN; bits--) {
+ s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+ bits - LU_SITE_BKT_BITS,
+ sizeof(*bkt), 0, 0,
+ &lu_site_hash_ops,
+ CFS_HASH_SPIN_BKTLOCK |
+ CFS_HASH_NO_ITEMREF |
+ CFS_HASH_DEPTH |
+ CFS_HASH_ASSERT_EMPTY);
+ if (s->ls_obj_hash != NULL)
+ break;
+ }
+
+ if (s->ls_obj_hash == NULL) {
+ CERROR("failed to create lu_site hash with bits: %d\n", bits);
+ return -ENOMEM;
+ }
+
+ cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+ bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+ INIT_LIST_HEAD(&bkt->lsb_lru);
+ init_waitqueue_head(&bkt->lsb_marche_funebre);
+ }
+
+ s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0);
+ if (s->ls_stats == NULL) {
+ cfs_hash_putref(s->ls_obj_hash);
+ s->ls_obj_hash = NULL;
+ return -ENOMEM;
+ }
+
+ lprocfs_counter_init(s->ls_stats, LU_SS_CREATED,
+ 0, "created", "created");
+ lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT,
+ 0, "cache_hit", "cache_hit");
+ lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS,
+ 0, "cache_miss", "cache_miss");
+ lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE,
+ 0, "cache_race", "cache_race");
+ lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE,
+ 0, "cache_death_race", "cache_death_race");
+ lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
+ 0, "lru_purged", "lru_purged");
+
+ INIT_LIST_HEAD(&s->ls_linkage);
+ s->ls_top_dev = top;
+ top->ld_site = s;
+ lu_device_get(top);
+ lu_ref_add(&top->ld_reference, "site-top", s);
+
+ INIT_LIST_HEAD(&s->ls_ld_linkage);
+ spin_lock_init(&s->ls_ld_lock);
+
+ lu_dev_add_linkage(s, top);
+
+ return 0;
+}
+EXPORT_SYMBOL(lu_site_init);
+
+/**
+ * Finalize \a s and release its resources.
+ */
+void lu_site_fini(struct lu_site *s)
+{
+ mutex_lock(&lu_sites_guard);
+ list_del_init(&s->ls_linkage);
+ mutex_unlock(&lu_sites_guard);
+
+ if (s->ls_obj_hash != NULL) {
+ cfs_hash_putref(s->ls_obj_hash);
+ s->ls_obj_hash = NULL;
+ }
+
+ if (s->ls_top_dev != NULL) {
+ s->ls_top_dev->ld_site = NULL;
+ lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s);
+ lu_device_put(s->ls_top_dev);
+ s->ls_top_dev = NULL;
+ }
+
+ if (s->ls_stats != NULL)
+ lprocfs_free_stats(&s->ls_stats);
+}
+EXPORT_SYMBOL(lu_site_fini);
+
+/**
+ * Called when initialization of stack for this site is completed.
+ */
+int lu_site_init_finish(struct lu_site *s)
+{
+ int result;
+
+ mutex_lock(&lu_sites_guard);
+ result = lu_context_refill(&lu_shrink_env.le_ctx);
+ if (result == 0)
+ list_add(&s->ls_linkage, &lu_sites);
+ mutex_unlock(&lu_sites_guard);
+ return result;
+}
+EXPORT_SYMBOL(lu_site_init_finish);
+
+/**
+ * Acquire additional reference on device \a d
+ */
+void lu_device_get(struct lu_device *d)
+{
+ atomic_inc(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_get);
+
+/**
+ * Release reference on device \a d.
+ */
+void lu_device_put(struct lu_device *d)
+{
+ LASSERT(atomic_read(&d->ld_ref) > 0);
+ atomic_dec(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_put);
+
+/**
+ * Initialize device \a d of type \a t.
+ */
+int lu_device_init(struct lu_device *d, struct lu_device_type *t)
+{
+ if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL)
+ t->ldt_ops->ldto_start(t);
+ memset(d, 0, sizeof(*d));
+ atomic_set(&d->ld_ref, 0);
+ d->ld_type = t;
+ lu_ref_init(&d->ld_reference);
+ INIT_LIST_HEAD(&d->ld_linkage);
+ return 0;
+}
+EXPORT_SYMBOL(lu_device_init);
+
+/**
+ * Finalize device \a d.
+ */
+void lu_device_fini(struct lu_device *d)
+{
+ struct lu_device_type *t;
+
+ t = d->ld_type;
+ if (d->ld_obd != NULL) {
+ d->ld_obd->obd_lu_dev = NULL;
+ d->ld_obd = NULL;
+ }
+
+ lu_ref_fini(&d->ld_reference);
+ LASSERTF(atomic_read(&d->ld_ref) == 0,
+ "Refcount is %u\n", atomic_read(&d->ld_ref));
+ LASSERT(t->ldt_device_nr > 0);
+ if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL)
+ t->ldt_ops->ldto_stop(t);
+}
+EXPORT_SYMBOL(lu_device_fini);
+
+/**
+ * Initialize object \a o that is part of compound object \a h and was created
+ * by device \a d.
+ */
+int lu_object_init(struct lu_object *o, struct lu_object_header *h,
+ struct lu_device *d)
+{
+ memset(o, 0, sizeof(*o));
+ o->lo_header = h;
+ o->lo_dev = d;
+ lu_device_get(d);
+ lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o);
+ INIT_LIST_HEAD(&o->lo_linkage);
+
+ return 0;
+}
+EXPORT_SYMBOL(lu_object_init);
+
+/**
+ * Finalize object and release its resources.
+ */
+void lu_object_fini(struct lu_object *o)
+{
+ struct lu_device *dev = o->lo_dev;
+
+ LASSERT(list_empty(&o->lo_linkage));
+
+ if (dev != NULL) {
+ lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref,
+ "lu_object", o);
+ lu_device_put(dev);
+ o->lo_dev = NULL;
+ }
+}
+EXPORT_SYMBOL(lu_object_fini);
+
+/**
+ * Add object \a o as first layer of compound object \a h
+ *
+ * This is typically called by the ->ldo_object_alloc() method of top-level
+ * device.
+ */
+void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
+{
+ list_move(&o->lo_linkage, &h->loh_layers);
+}
+EXPORT_SYMBOL(lu_object_add_top);
+
+/**
+ * Add object \a o as a layer of compound object, going after \a before.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of \a
+ * before->lo_dev.
+ */
+void lu_object_add(struct lu_object *before, struct lu_object *o)
+{
+ list_move(&o->lo_linkage, &before->lo_linkage);
+}
+EXPORT_SYMBOL(lu_object_add);
+
+/**
+ * Initialize compound object.
+ */
+int lu_object_header_init(struct lu_object_header *h)
+{
+ memset(h, 0, sizeof(*h));
+ atomic_set(&h->loh_ref, 1);
+ INIT_HLIST_NODE(&h->loh_hash);
+ INIT_LIST_HEAD(&h->loh_lru);
+ INIT_LIST_HEAD(&h->loh_layers);
+ lu_ref_init(&h->loh_reference);
+ return 0;
+}
+EXPORT_SYMBOL(lu_object_header_init);
+
+/**
+ * Finalize compound object.
+ */
+void lu_object_header_fini(struct lu_object_header *h)
+{
+ LASSERT(list_empty(&h->loh_layers));
+ LASSERT(list_empty(&h->loh_lru));
+ LASSERT(hlist_unhashed(&h->loh_hash));
+ lu_ref_fini(&h->loh_reference);
+}
+EXPORT_SYMBOL(lu_object_header_fini);
+
+/**
+ * Given a compound object, find its slice corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+ const struct lu_device_type *dtype)
+{
+ struct lu_object *o;
+
+ list_for_each_entry(o, &h->loh_layers, lo_linkage) {
+ if (o->lo_dev->ld_type == dtype)
+ return o;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(lu_object_locate);
+
+
+
+/**
+ * Finalize and free devices in the device stack.
+ *
+ * Finalize device stack by purging object cache, and calling
+ * lu_device_type_operations::ldto_device_fini() and
+ * lu_device_type_operations::ldto_device_free() on all devices in the stack.
+ */
+void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
+{
+ struct lu_site *site = top->ld_site;
+ struct lu_device *scan;
+ struct lu_device *next;
+
+ lu_site_purge(env, site, ~0);
+ for (scan = top; scan != NULL; scan = next) {
+ next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan);
+ lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init);
+ lu_device_put(scan);
+ }
+
+ /* purge again. */
+ lu_site_purge(env, site, ~0);
+
+ for (scan = top; scan != NULL; scan = next) {
+ const struct lu_device_type *ldt = scan->ld_type;
+ struct obd_type *type;
+
+ next = ldt->ldt_ops->ldto_device_free(env, scan);
+ type = ldt->ldt_obd_type;
+ if (type != NULL) {
+ type->typ_refcnt--;
+ class_put_type(type);
+ }
+ }
+}
+EXPORT_SYMBOL(lu_stack_fini);
+
+enum {
+ /**
+ * Maximal number of tld slots.
+ */
+ LU_CONTEXT_KEY_NR = 40
+};
+
+static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
+
+static DEFINE_SPINLOCK(lu_keys_guard);
+
+/**
+ * Global counter incremented whenever key is registered, unregistered,
+ * revived or quiesced. This is used to avoid unnecessary calls to
+ * lu_context_refill(). No locking is provided, as initialization and shutdown
+ * are supposed to be externally serialized.
+ */
+static unsigned key_set_version;
+
+/**
+ * Register new key.
+ */
+int lu_context_key_register(struct lu_context_key *key)
+{
+ int result;
+ int i;
+
+ LASSERT(key->lct_init != NULL);
+ LASSERT(key->lct_fini != NULL);
+ LASSERT(key->lct_tags != 0);
+
+ result = -ENFILE;
+ spin_lock(&lu_keys_guard);
+ for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+ if (lu_keys[i] == NULL) {
+ key->lct_index = i;
+ atomic_set(&key->lct_used, 1);
+ lu_keys[i] = key;
+ lu_ref_init(&key->lct_reference);
+ result = 0;
+ ++key_set_version;
+ break;
+ }
+ }
+ spin_unlock(&lu_keys_guard);
+ return result;
+}
+EXPORT_SYMBOL(lu_context_key_register);
+
+static void key_fini(struct lu_context *ctx, int index)
+{
+ if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) {
+ struct lu_context_key *key;
+
+ key = lu_keys[index];
+ LASSERT(key != NULL);
+ LASSERT(key->lct_fini != NULL);
+ LASSERT(atomic_read(&key->lct_used) > 1);
+
+ key->lct_fini(ctx, key, ctx->lc_value[index]);
+ lu_ref_del(&key->lct_reference, "ctx", ctx);
+ atomic_dec(&key->lct_used);
+
+ if ((ctx->lc_tags & LCT_NOREF) == 0) {
+#ifdef CONFIG_MODULE_UNLOAD
+ LINVRNT(module_refcount(key->lct_owner) > 0);
+#endif
+ module_put(key->lct_owner);
+ }
+ ctx->lc_value[index] = NULL;
+ }
+}
+
+/**
+ * Deregister key.
+ */
+void lu_context_key_degister(struct lu_context_key *key)
+{
+ LASSERT(atomic_read(&key->lct_used) >= 1);
+ LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+
+ lu_context_key_quiesce(key);
+
+ ++key_set_version;
+ spin_lock(&lu_keys_guard);
+ key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+ if (lu_keys[key->lct_index]) {
+ lu_keys[key->lct_index] = NULL;
+ lu_ref_fini(&key->lct_reference);
+ }
+ spin_unlock(&lu_keys_guard);
+
+ LASSERTF(atomic_read(&key->lct_used) == 1,
+ "key has instances: %d\n",
+ atomic_read(&key->lct_used));
+}
+EXPORT_SYMBOL(lu_context_key_degister);
+
+/**
+ * Register a number of keys. This has to be called after all keys have been
+ * initialized by a call to LU_CONTEXT_KEY_INIT().
+ */
+int lu_context_key_register_many(struct lu_context_key *k, ...)
+{
+ struct lu_context_key *key = k;
+ va_list args;
+ int result;
+
+ va_start(args, k);
+ do {
+ result = lu_context_key_register(key);
+ if (result)
+ break;
+ key = va_arg(args, struct lu_context_key *);
+ } while (key != NULL);
+ va_end(args);
+
+ if (result != 0) {
+ va_start(args, k);
+ while (k != key) {
+ lu_context_key_degister(k);
+ k = va_arg(args, struct lu_context_key *);
+ }
+ va_end(args);
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(lu_context_key_register_many);
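+
+/*
+ * Usage sketch (illustrative, not part of the original import; the "foo"
+ * names are hypothetical): a module typically declares its per-context data
+ * and key, registers the key at module init and deregisters it at module
+ * exit:
+ *
+ *     LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *
+ *     struct lu_context_key foo_thread_key = {
+ *             .lct_tags = LCT_MD_THREAD | LCT_CL_THREAD,
+ *             .lct_init = foo_key_init,
+ *             .lct_fini = foo_key_fini
+ *     };
+ *
+ *     LU_CONTEXT_KEY_INIT(&foo_thread_key);
+ *     rc = lu_context_key_register_many(&foo_thread_key, NULL);
+ *     ...
+ *     lu_context_key_degister_many(&foo_thread_key, NULL);
+ *
+ * The per-context value is then retrieved with
+ * lu_context_key_get(&env->le_ctx, &foo_thread_key).
+ */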
+
+/**
+ * De-register a number of keys. This is a dual to
+ * lu_context_key_register_many().
+ */
+void lu_context_key_degister_many(struct lu_context_key *k, ...)
+{
+ va_list args;
+
+ va_start(args, k);
+ do {
+ lu_context_key_degister(k);
+ k = va_arg(args, struct lu_context_key*);
+ } while (k != NULL);
+ va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_degister_many);
+
+/**
+ * Revive a number of keys.
+ */
+void lu_context_key_revive_many(struct lu_context_key *k, ...)
+{
+ va_list args;
+
+ va_start(args, k);
+ do {
+ lu_context_key_revive(k);
+ k = va_arg(args, struct lu_context_key*);
+ } while (k != NULL);
+ va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_revive_many);
+
+/**
+ * Quiesce a number of keys.
+ */
+void lu_context_key_quiesce_many(struct lu_context_key *k, ...)
+{
+ va_list args;
+
+ va_start(args, k);
+ do {
+ lu_context_key_quiesce(k);
+ k = va_arg(args, struct lu_context_key*);
+ } while (k != NULL);
+ va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_quiesce_many);
+
+/**
+ * Return value associated with key \a key in context \a ctx.
+ */
+void *lu_context_key_get(const struct lu_context *ctx,
+ const struct lu_context_key *key)
+{
+ LINVRNT(ctx->lc_state == LCS_ENTERED);
+ LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+ LASSERT(lu_keys[key->lct_index] == key);
+ return ctx->lc_value[key->lct_index];
+}
+EXPORT_SYMBOL(lu_context_key_get);
+
+/**
+ * List of remembered contexts. XXX document me.
+ */
+static LIST_HEAD(lu_context_remembered);
+
+/**
+ * Destroy \a key in all remembered contexts. This is used to destroy key
+ * values in "shared" contexts (like service threads), when a module owning
+ * the key is about to be unloaded.
+ */
+void lu_context_key_quiesce(struct lu_context_key *key)
+{
+ struct lu_context *ctx;
+
+ if (!(key->lct_tags & LCT_QUIESCENT)) {
+ /*
+ * XXX layering violation.
+ */
+ key->lct_tags |= LCT_QUIESCENT;
+ /*
+ * XXX memory barrier has to go here.
+ */
+ spin_lock(&lu_keys_guard);
+ list_for_each_entry(ctx, &lu_context_remembered,
+ lc_remember)
+ key_fini(ctx, key->lct_index);
+ spin_unlock(&lu_keys_guard);
+ ++key_set_version;
+ }
+}
+EXPORT_SYMBOL(lu_context_key_quiesce);
+
+void lu_context_key_revive(struct lu_context_key *key)
+{
+ key->lct_tags &= ~LCT_QUIESCENT;
+ ++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_revive);
+
+static void keys_fini(struct lu_context *ctx)
+{
+ int i;
+
+ if (ctx->lc_value == NULL)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(lu_keys); ++i)
+ key_fini(ctx, i);
+
+ OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof(ctx->lc_value[0]));
+ ctx->lc_value = NULL;
+}
+
+static int keys_fill(struct lu_context *ctx)
+{
+ int i;
+
+ LINVRNT(ctx->lc_value != NULL);
+ for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+ struct lu_context_key *key;
+
+ key = lu_keys[i];
+ if (ctx->lc_value[i] == NULL && key != NULL &&
+ (key->lct_tags & ctx->lc_tags) &&
+ /*
+ * Don't create values for a LCT_QUIESCENT key, as this
+ * will pin module owning a key.
+ */
+ !(key->lct_tags & LCT_QUIESCENT)) {
+ void *value;
+
+ LINVRNT(key->lct_init != NULL);
+ LINVRNT(key->lct_index == i);
+
+ value = key->lct_init(ctx, key);
+ if (unlikely(IS_ERR(value)))
+ return PTR_ERR(value);
+
+ if (!(ctx->lc_tags & LCT_NOREF))
+ try_module_get(key->lct_owner);
+ lu_ref_add_atomic(&key->lct_reference, "ctx", ctx);
+ atomic_inc(&key->lct_used);
+ /*
+ * This is the only place in the code where an
+ * element of the ctx->lc_value[] array is set to a
+ * non-NULL value.
+ */
+ ctx->lc_value[i] = value;
+ if (key->lct_exit != NULL)
+ ctx->lc_tags |= LCT_HAS_EXIT;
+ }
+ ctx->lc_version = key_set_version;
+ }
+ return 0;
+}
+
+static int keys_init(struct lu_context *ctx)
+{
+ OBD_ALLOC(ctx->lc_value,
+ ARRAY_SIZE(lu_keys) * sizeof(ctx->lc_value[0]));
+ if (likely(ctx->lc_value != NULL))
+ return keys_fill(ctx);
+
+ return -ENOMEM;
+}
+
+/**
+ * Initialize context data-structure. Create values for all keys.
+ */
+int lu_context_init(struct lu_context *ctx, __u32 tags)
+{
+ int rc;
+
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->lc_state = LCS_INITIALIZED;
+ ctx->lc_tags = tags;
+ if (tags & LCT_REMEMBER) {
+ spin_lock(&lu_keys_guard);
+ list_add(&ctx->lc_remember, &lu_context_remembered);
+ spin_unlock(&lu_keys_guard);
+ } else {
+ INIT_LIST_HEAD(&ctx->lc_remember);
+ }
+
+ rc = keys_init(ctx);
+ if (rc != 0)
+ lu_context_fini(ctx);
+
+ return rc;
+}
+EXPORT_SYMBOL(lu_context_init);
+
+/**
+ * Finalize context data-structure. Destroy key values.
+ */
+void lu_context_fini(struct lu_context *ctx)
+{
+ LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+ ctx->lc_state = LCS_FINALIZED;
+
+ if ((ctx->lc_tags & LCT_REMEMBER) == 0) {
+ LASSERT(list_empty(&ctx->lc_remember));
+ keys_fini(ctx);
+
+ } else { /* could race with key degister */
+ spin_lock(&lu_keys_guard);
+ keys_fini(ctx);
+ list_del_init(&ctx->lc_remember);
+ spin_unlock(&lu_keys_guard);
+ }
+}
+EXPORT_SYMBOL(lu_context_fini);
+
+/**
+ * Called before entering context.
+ */
+void lu_context_enter(struct lu_context *ctx)
+{
+ LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+ ctx->lc_state = LCS_ENTERED;
+}
+EXPORT_SYMBOL(lu_context_enter);
+
+/**
+ * Called after exiting from \a ctx
+ */
+void lu_context_exit(struct lu_context *ctx)
+{
+ int i;
+
+ LINVRNT(ctx->lc_state == LCS_ENTERED);
+ ctx->lc_state = LCS_LEFT;
+ if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) {
+ for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+ if (ctx->lc_value[i] != NULL) {
+ struct lu_context_key *key;
+
+ key = lu_keys[i];
+ LASSERT(key != NULL);
+ if (key->lct_exit != NULL)
+ key->lct_exit(ctx,
+ key, ctx->lc_value[i]);
+ }
+ }
+ }
+}
+EXPORT_SYMBOL(lu_context_exit);
+
+/**
+ * Allocate, for the given context, all missing keys that were registered
+ * after the context was created. key_set_version is only changed in rare
+ * cases when modules are loaded or removed.
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+ return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx);
+}
+EXPORT_SYMBOL(lu_context_refill);
+
+/**
+ * lu_ctx_tags/lu_ses_tags will be updated if new types of obd are added.
+ * Currently, this is only used on the client side, specifically for the echo
+ * device client; for other stacks (like ptlrpc threads), contexts are
+ * predefined when the lu_device types are registered, during the module probe
+ * phase.
+ */
+__u32 lu_context_tags_default;
+__u32 lu_session_tags_default;
+
+void lu_context_tags_update(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_context_tags_default |= tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+void lu_context_tags_clear(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_context_tags_default &= ~tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+void lu_session_tags_update(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_session_tags_default |= tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+void lu_session_tags_clear(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_session_tags_default &= ~tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+ int result;
+
+ env->le_ses = NULL;
+ result = lu_context_init(&env->le_ctx, tags);
+ if (likely(result == 0))
+ lu_context_enter(&env->le_ctx);
+ return result;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+void lu_env_fini(struct lu_env *env)
+{
+ lu_context_exit(&env->le_ctx);
+ lu_context_fini(&env->le_ctx);
+ env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
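+
+/*
+ * Life-cycle sketch (illustrative, not part of the original import): a
+ * caller that needs a private environment initializes it, uses it and
+ * finalizes it; lu_env_init() already enters the context and lu_env_fini()
+ * exits it:
+ *
+ *     struct lu_env env;
+ *     int rc;
+ *
+ *     rc = lu_env_init(&env, LCT_LOCAL);
+ *     if (rc != 0)
+ *             return rc;
+ *     ... pass &env to lu_* calls ...
+ *     lu_env_fini(&env);
+ */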
+
+int lu_env_refill(struct lu_env *env)
+{
+ int result;
+
+ result = lu_context_refill(&env->le_ctx);
+ if (result == 0 && env->le_ses != NULL)
+ result = lu_context_refill(env->le_ses);
+ return result;
+}
+EXPORT_SYMBOL(lu_env_refill);
+
+/**
+ * Currently, this API is only used by the echo client, because the echo
+ * client and the normal Lustre client share the same cl_env cache. So the
+ * echo client needs to refresh the env context after it gets one from the
+ * cache, especially when a normal client and an echo client co-exist on the
+ * same client node.
+ */
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags,
+ __u32 stags)
+{
+ if ((env->le_ctx.lc_tags & ctags) != ctags) {
+ env->le_ctx.lc_version = 0;
+ env->le_ctx.lc_tags |= ctags;
+ }
+
+ if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) {
+ env->le_ses->lc_version = 0;
+ env->le_ses->lc_tags |= stags;
+ }
+
+ return lu_env_refill(env);
+}
+EXPORT_SYMBOL(lu_env_refill_by_tags);
+
+
+typedef struct lu_site_stats {
+ unsigned lss_populated;
+ unsigned lss_max_search;
+ unsigned lss_total;
+ unsigned lss_busy;
+} lu_site_stats_t;
+
+static void lu_site_stats_get(struct cfs_hash *hs,
+ lu_site_stats_t *stats, int populated)
+{
+ struct cfs_hash_bd bd;
+ int i;
+
+ cfs_hash_for_each_bucket(hs, &bd, i) {
+ struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd);
+ struct hlist_head *hhead;
+
+ cfs_hash_bd_lock(hs, &bd, 1);
+ stats->lss_busy += bkt->lsb_busy;
+ stats->lss_total += cfs_hash_bd_count_get(&bd);
+ stats->lss_max_search = max((int)stats->lss_max_search,
+ cfs_hash_bd_depmax_get(&bd));
+ if (!populated) {
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ continue;
+ }
+
+ cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+ if (!hlist_empty(hhead))
+ stats->lss_populated++;
+ }
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ }
+}
+
+
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop their respective hold
+ * on their lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
+static unsigned long lu_cache_shrink_count(struct shrinker *sk,
+ struct shrink_control *sc)
+{
+ lu_site_stats_t stats;
+ struct lu_site *s;
+ struct lu_site *tmp;
+ unsigned long cached = 0;
+
+ if (!(sc->gfp_mask & __GFP_FS))
+ return 0;
+
+ mutex_lock(&lu_sites_guard);
+ list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+ memset(&stats, 0, sizeof(stats));
+ lu_site_stats_get(s->ls_obj_hash, &stats, 0);
+ cached += stats.lss_total - stats.lss_busy;
+ }
+ mutex_unlock(&lu_sites_guard);
+
+ cached = (cached / 100) * sysctl_vfs_cache_pressure;
+ CDEBUG(D_INODE, "%ld objects cached\n", cached);
+ return cached;
+}
+
+static unsigned long lu_cache_shrink_scan(struct shrinker *sk,
+ struct shrink_control *sc)
+{
+ struct lu_site *s;
+ struct lu_site *tmp;
+ unsigned long remain = sc->nr_to_scan, freed = 0;
+ LIST_HEAD(splice);
+
+ if (!(sc->gfp_mask & __GFP_FS))
+ /* We must not take the lu_sites_guard lock when
+ * __GFP_FS is *not* set because of the deadlock
+ * possibility detailed above. Additionally,
+ * since we cannot determine the number of
+ * objects in the cache without taking this
+ * lock, we're in a particularly tough spot. As
+ * a result, we'll just lie and say our cache is
+ * empty. This _should_ be ok, as we can't
+ * reclaim objects when __GFP_FS is *not* set
+ * anyways.
+ */
+ return SHRINK_STOP;
+
+ mutex_lock(&lu_sites_guard);
+ list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+ freed = lu_site_purge(&lu_shrink_env, s, remain);
+ remain -= freed;
+ /*
+ * Move just shrunk site to the tail of site list to
+ * assure shrinking fairness.
+ */
+ list_move_tail(&s->ls_linkage, &splice);
+ }
+ list_splice(&splice, lu_sites.prev);
+ mutex_unlock(&lu_sites_guard);
+
+ return sc->nr_to_scan - remain;
+}
+
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment to be used in debugger, contains all tags.
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer function using printk().
+ */
+int lu_printk_printer(const struct lu_env *env,
+ void *unused, const char *format, ...)
+{
+ va_list args;
+
+ va_start(args, format);
+ vprintk(format, args);
+ va_end(args);
+ return 0;
+}
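+
+/*
+ * Illustrative use (not part of the original import): from a debugging
+ * context the whole site, or a single object, can be dumped to the console
+ * with this printer; the cookie argument is ignored, so NULL is fine:
+ *
+ *     lu_site_print(env, site, NULL, lu_printk_printer);
+ *     lu_object_print(env, NULL, lu_printk_printer, obj);
+ */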
+
+static struct shrinker lu_site_shrinker = {
+ .count_objects = lu_cache_shrink_count,
+ .scan_objects = lu_cache_shrink_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void)
+{
+ int result;
+
+ CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys);
+
+ result = lu_ref_global_init();
+ if (result != 0)
+ return result;
+
+ LU_CONTEXT_KEY_INIT(&lu_global_key);
+ result = lu_context_key_register(&lu_global_key);
+ if (result != 0)
+ return result;
+
+ /*
+ * At this level, we don't know what tags are needed, so allocate them
+ * conservatively. This should not be too bad, because this
+ * environment is global.
+ */
+ mutex_lock(&lu_sites_guard);
+ result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+ mutex_unlock(&lu_sites_guard);
+ if (result != 0)
+ return result;
+
+ /*
+ * seeks estimation: 3 seeks to read a record from oi, one to read
+ * inode, one for ea. Unfortunately setting this high value results in
+ * lu_object/inode cache consuming all the memory.
+ */
+ register_shrinker(&lu_site_shrinker);
+
+ return result;
+}
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void)
+{
+ unregister_shrinker(&lu_site_shrinker);
+ lu_context_key_degister(&lu_global_key);
+
+ /*
+ * Tear shrinker environment down _after_ de-registering
+ * lu_global_key, because the latter has a value in the former.
+ */
+ mutex_lock(&lu_sites_guard);
+ lu_env_fini(&lu_shrink_env);
+ mutex_unlock(&lu_sites_guard);
+
+ lu_ref_global_fini();
+}
+
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
+{
+#if defined(CONFIG_PROC_FS)
+ struct lprocfs_counter ret;
+
+ lprocfs_stats_collect(stats, idx, &ret);
+ return (__u32)ret.lc_count;
+#else
+ return 0;
+#endif
+}
+
+/**
+ * Output site statistical counters into a seq_file. Suitable for
+ * lprocfs_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
+{
+ lu_site_stats_t stats;
+
+ memset(&stats, 0, sizeof(stats));
+ lu_site_stats_get(s->ls_obj_hash, &stats, 1);
+
+ seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+ stats.lss_busy,
+ stats.lss_total,
+ stats.lss_populated,
+ CFS_HASH_NHLIST(s->ls_obj_hash),
+ stats.lss_max_search,
+ ls_stats_read(s->ls_stats, LU_SS_CREATED),
+ ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT),
+ ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
+ ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
+ ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
+ ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+ return 0;
+}
+EXPORT_SYMBOL(lu_site_stats_print);
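+
+/*
+ * Note (added for clarity, not part of the original import): the columns
+ * printed above are busy/total, populated/nhlist, max_search, followed by
+ * the created, cache_hit, cache_miss, cache_race, cache_death_race and
+ * lru_purged counters.
+ */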
+
+/**
+ * Helper function to initialize a number of kmem slab caches at once.
+ */
+int lu_kmem_init(struct lu_kmem_descr *caches)
+{
+ int result;
+ struct lu_kmem_descr *iter = caches;
+
+ for (result = 0; iter->ckd_cache != NULL; ++iter) {
+ *iter->ckd_cache = kmem_cache_create(iter->ckd_name,
+ iter->ckd_size,
+ 0, 0, NULL);
+ if (*iter->ckd_cache == NULL) {
+ result = -ENOMEM;
+ /* free all previously allocated caches */
+ lu_kmem_fini(caches);
+ break;
+ }
+ }
+ return result;
+}
+EXPORT_SYMBOL(lu_kmem_init);
+
+/**
+ * Helper function to finalize a number of kmem slab caches at once. Dual to
+ * lu_kmem_init().
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+ for (; caches->ckd_cache != NULL; ++caches) {
+ if (*caches->ckd_cache != NULL) {
+ kmem_cache_destroy(*caches->ckd_cache);
+ *caches->ckd_cache = NULL;
+ }
+ }
+}
+EXPORT_SYMBOL(lu_kmem_fini);
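+
+/*
+ * Usage sketch (illustrative, not part of the original import; the "foo"
+ * names are hypothetical): callers describe their caches in a
+ * NULL-terminated array and set them up/tear them down in one call:
+ *
+ *     static struct kmem_cache *foo_object_kmem;
+ *
+ *     static struct lu_kmem_descr foo_caches[] = {
+ *             {
+ *                     .ckd_cache = &foo_object_kmem,
+ *                     .ckd_name  = "foo_object_kmem",
+ *                     .ckd_size  = sizeof(struct foo_object)
+ *             },
+ *             {
+ *                     .ckd_cache = NULL
+ *             }
+ *     };
+ *
+ *     rc = lu_kmem_init(foo_caches);
+ *     ...
+ *     lu_kmem_fini(foo_caches);
+ */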
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+ const struct lu_fid *fid)
+{
+ struct lu_site *s = o->lo_dev->ld_site;
+ struct lu_fid *old = &o->lo_header->loh_fid;
+ struct lu_site_bkt_data *bkt;
+ struct lu_object *shadow;
+ wait_queue_t waiter;
+ struct cfs_hash *hs;
+ struct cfs_hash_bd bd;
+ __u64 version = 0;
+
+ LASSERT(fid_is_zero(old));
+
+ hs = s->ls_obj_hash;
+ cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1);
+ shadow = htable_lookup(s, &bd, fid, &waiter, &version);
+ /* supposed to be unique */
+ LASSERT(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT);
+ *old = *fid;
+ bkt = cfs_hash_bd_extra_get(hs, &bd);
+ cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+ bkt->lsb_busy++;
+ cfs_hash_bd_unlock(hs, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (not-yet-assigned) fid.
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_object_conf *conf)
+{
+ struct lu_fid fid;
+ struct lu_object *o;
+
+ fid_zero(&fid);
+ o = lu_object_alloc(env, dev, &fid, conf);
+
+ return o;
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+struct lu_buf LU_BUF_NULL = {
+ .lb_buf = NULL,
+ .lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+void lu_buf_free(struct lu_buf *buf)
+{
+ LASSERT(buf);
+ if (buf->lb_buf) {
+ LASSERT(buf->lb_len > 0);
+ OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+ buf->lb_buf = NULL;
+ buf->lb_len = 0;
+ }
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+void lu_buf_alloc(struct lu_buf *buf, int size)
+{
+ LASSERT(buf);
+ LASSERT(buf->lb_buf == NULL);
+ LASSERT(buf->lb_len == 0);
+ OBD_ALLOC_LARGE(buf->lb_buf, size);
+ if (likely(buf->lb_buf))
+ buf->lb_len = size;
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+void lu_buf_realloc(struct lu_buf *buf, int size)
+{
+ lu_buf_free(buf);
+ lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len)
+{
+ if (buf->lb_buf == NULL && buf->lb_len == 0)
+ lu_buf_alloc(buf, len);
+
+ if ((len > buf->lb_len) && (buf->lb_buf != NULL))
+ lu_buf_realloc(buf, len);
+
+ return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Increase the size of the \a buf.
+ * Preserves the old data in the buffer; the old buffer remains unchanged
+ * on error.
+ * \retval 0 or -ENOMEM
+ */
+int lu_buf_check_and_grow(struct lu_buf *buf, int len)
+{
+ char *ptr;
+
+ if (len <= buf->lb_len)
+ return 0;
+
+ OBD_ALLOC_LARGE(ptr, len);
+ if (ptr == NULL)
+ return -ENOMEM;
+
+ /* Free the old buf */
+ if (buf->lb_buf != NULL) {
+ memcpy(ptr, buf->lb_buf, buf->lb_len);
+ OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+ }
+
+ buf->lb_buf = ptr;
+ buf->lb_len = len;
+ return 0;
+}
+EXPORT_SYMBOL(lu_buf_check_and_grow);
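+
+/*
+ * Usage sketch for the lu_buf helpers above (illustrative, not part of the
+ * original import):
+ *
+ *     struct lu_buf buf = { .lb_buf = NULL, .lb_len = 0 };
+ *     int rc;
+ *
+ *     lu_buf_alloc(&buf, PAGE_CACHE_SIZE);
+ *     if (buf.lb_buf == NULL)
+ *             return -ENOMEM;
+ *     ...
+ *     rc = lu_buf_check_and_grow(&buf, 2 * PAGE_CACHE_SIZE);
+ *     ...
+ *     lu_buf_free(&buf);
+ */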
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
new file mode 100644
index 000000000..993697b66
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ref.c
+ *
+ * Lustre reference.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lu_ref.h"
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
new file mode 100644
index 000000000..f720e3183
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
@@ -0,0 +1,257 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_support.h"
+#include "../include/lustre_handles.h"
+#include "../include/lustre_lib.h"
+
+
+static __u64 handle_base;
+#define HANDLE_INCR 7
+static spinlock_t handle_base_lock;
+
+static struct handle_bucket {
+ spinlock_t lock;
+ struct list_head head;
+} *handle_hash;
+
+#define HANDLE_HASH_SIZE (1 << 16)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * global (per-node) hash-table.
+ */
+void class_handle_hash(struct portals_handle *h,
+ struct portals_handle_ops *ops)
+{
+ struct handle_bucket *bucket;
+
+ LASSERT(h != NULL);
+ LASSERT(list_empty(&h->h_link));
+
+ /*
+ * This is a fast but simplistic cookie-generation algorithm; it will
+ * need a re-do at some point in the future for security.
+ */
+ spin_lock(&handle_base_lock);
+ handle_base += HANDLE_INCR;
+
+ if (unlikely(handle_base == 0)) {
+ /*
+ * Cookie of zero is "dangerous", because in many places it's
+ * assumed that 0 means "unassigned" handle, not bound to any
+ * object.
+ */
+ CWARN("The universe has been exhausted: cookie wrap-around.\n");
+ handle_base += HANDLE_INCR;
+ }
+ h->h_cookie = handle_base;
+ spin_unlock(&handle_base_lock);
+
+ h->h_ops = ops;
+ spin_lock_init(&h->h_lock);
+
+ bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+ spin_lock(&bucket->lock);
+ list_add_rcu(&h->h_link, &bucket->head);
+ h->h_in = 1;
+ spin_unlock(&bucket->lock);
+
+ CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n",
+ h, h->h_cookie);
+}
+EXPORT_SYMBOL(class_handle_hash);
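+
+/*
+ * Usage sketch (illustrative, not part of the original import; the "foo"
+ * names are hypothetical): an object embeds a portals_handle, supplies ops
+ * so that lookups take a reference, and hashes/unhashes the handle around
+ * its lifetime:
+ *
+ *     struct foo {
+ *             struct portals_handle foo_handle;
+ *             ...
+ *     };
+ *
+ *     static void foo_handle_addref(struct portals_handle *h)
+ *     {
+ *             ... take a reference on container_of(h, struct foo,
+ *                 foo_handle) ...
+ *     }
+ *
+ *     static struct portals_handle_ops foo_handle_ops = {
+ *             .hop_addref = foo_handle_addref,
+ *     };
+ *
+ *     INIT_LIST_HEAD(&foo->foo_handle.h_link);
+ *     class_handle_hash(&foo->foo_handle, &foo_handle_ops);
+ *     ...
+ *     foo = class_handle2object(cookie);     takes a ref via hop_addref
+ *     ...
+ *     class_handle_unhash(&foo->foo_handle);
+ */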
+
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{
+ if (list_empty(&h->h_link)) {
+ CERROR("removing an already-removed handle (%#llx)\n",
+ h->h_cookie);
+ return;
+ }
+
+ CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n",
+ h, h->h_cookie);
+
+ spin_lock(&h->h_lock);
+ if (h->h_in == 0) {
+ spin_unlock(&h->h_lock);
+ return;
+ }
+ h->h_in = 0;
+ spin_unlock(&h->h_lock);
+ list_del_rcu(&h->h_link);
+}
+
+void class_handle_unhash(struct portals_handle *h)
+{
+ struct handle_bucket *bucket;
+
+ bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+ spin_lock(&bucket->lock);
+ class_handle_unhash_nolock(h);
+ spin_unlock(&bucket->lock);
+}
+EXPORT_SYMBOL(class_handle_unhash);
+
+void class_handle_hash_back(struct portals_handle *h)
+{
+ struct handle_bucket *bucket;
+
+ bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+ spin_lock(&bucket->lock);
+ list_add_rcu(&h->h_link, &bucket->head);
+ h->h_in = 1;
+ spin_unlock(&bucket->lock);
+}
+EXPORT_SYMBOL(class_handle_hash_back);
+
+void *class_handle2object(__u64 cookie)
+{
+ struct handle_bucket *bucket;
+ struct portals_handle *h;
+ void *retval = NULL;
+
+ LASSERT(handle_hash != NULL);
+
+ /* Be careful when you want to change this code. See the
+ * rcu_read_lock() definition at the top of this file. - jxiong */
+ bucket = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(h, &bucket->head, h_link) {
+ if (h->h_cookie != cookie)
+ continue;
+
+ spin_lock(&h->h_lock);
+ if (likely(h->h_in != 0)) {
+ h->h_ops->hop_addref(h);
+ retval = h;
+ }
+ spin_unlock(&h->h_lock);
+ break;
+ }
+ rcu_read_unlock();
+
+ return retval;
+}
+EXPORT_SYMBOL(class_handle2object);
+
+void class_handle_free_cb(struct rcu_head *rcu)
+{
+ struct portals_handle *h = RCU2HANDLE(rcu);
+ void *ptr = (void *)(unsigned long)h->h_cookie;
+
+ if (h->h_ops->hop_free != NULL)
+ h->h_ops->hop_free(ptr, h->h_size);
+ else
+ OBD_FREE(ptr, h->h_size);
+}
+EXPORT_SYMBOL(class_handle_free_cb);
+
+int class_handle_init(void)
+{
+ struct handle_bucket *bucket;
+ struct timeval tv;
+ int seed[2];
+
+ LASSERT(handle_hash == NULL);
+
+ OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE);
+ if (handle_hash == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&handle_base_lock);
+ for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash;
+ bucket--) {
+ INIT_LIST_HEAD(&bucket->head);
+ spin_lock_init(&bucket->lock);
+ }
+
+ /* bug 21430: add randomness to the initial base */
+ cfs_get_random_bytes(seed, sizeof(seed));
+ do_gettimeofday(&tv);
+ cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+ cfs_get_random_bytes(&handle_base, sizeof(handle_base));
+ LASSERT(handle_base != 0ULL);
+
+ return 0;
+}
+
+static int cleanup_all_handles(void)
+{
+ int rc;
+ int i;
+
+ for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) {
+ struct portals_handle *h;
+
+ spin_lock(&handle_hash[i].lock);
+ list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) {
+ CERROR("force clean handle %#llx addr %p ops %p\n",
+ h->h_cookie, h, h->h_ops);
+
+ class_handle_unhash_nolock(h);
+ rc++;
+ }
+ spin_unlock(&handle_hash[i].lock);
+ }
+
+ return rc;
+}
+
+void class_handle_cleanup(void)
+{
+ int count;
+
+ LASSERT(handle_hash != NULL);
+
+ count = cleanup_all_handles();
+
+ OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+ handle_hash = NULL;
+
+ if (count != 0)
+ CERROR("handle_count at cleanup: %d\n", count);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
new file mode 100644
index 000000000..64b2f35e2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
@@ -0,0 +1,217 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../include/obd.h"
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_net.h"
+#include "../include/lprocfs_status.h"
+
+#define NIDS_MAX 32
+
+struct uuid_nid_data {
+ struct list_head un_list;
+ struct obd_uuid un_uuid;
+ int un_nid_count;
+ lnet_nid_t un_nids[NIDS_MAX];
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head g_uuid_list;
+static spinlock_t g_uuid_lock;
+
+void class_init_uuidlist(void)
+{
+ INIT_LIST_HEAD(&g_uuid_list);
+ spin_lock_init(&g_uuid_lock);
+}
+
+void class_exit_uuidlist(void)
+{
+ /* delete all */
+ class_del_uuid(NULL);
+}
+
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+ struct uuid_nid_data *data;
+ struct obd_uuid tmp;
+ int rc = -ENOENT;
+
+ obd_str2uuid(&tmp, uuid);
+ spin_lock(&g_uuid_lock);
+ list_for_each_entry(data, &g_uuid_list, un_list) {
+ if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+ if (index >= data->un_nid_count)
+ break;
+
+ rc = 0;
+ *peer_nid = data->un_nids[index];
+ break;
+ }
+ }
+ spin_unlock(&g_uuid_lock);
+ return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/* Add a nid to a niduuid. Multiple nids can be added to a single uuid;
+ LNET will choose the best one. */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+ struct uuid_nid_data *data, *entry;
+ int found = 0;
+
+ LASSERT(nid != 0); /* valid newconfig NID is never zero */
+
+ if (strlen(uuid) > UUID_MAX - 1)
+ return -EOVERFLOW;
+
+ OBD_ALLOC_PTR(data);
+ if (data == NULL)
+ return -ENOMEM;
+
+ obd_str2uuid(&data->un_uuid, uuid);
+ data->un_nids[0] = nid;
+ data->un_nid_count = 1;
+
+ spin_lock(&g_uuid_lock);
+ list_for_each_entry(entry, &g_uuid_list, un_list) {
+ if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+ int i;
+
+ found = 1;
+ for (i = 0; i < entry->un_nid_count; i++)
+ if (nid == entry->un_nids[i])
+ break;
+
+ if (i == entry->un_nid_count) {
+ LASSERT(entry->un_nid_count < NIDS_MAX);
+ entry->un_nids[entry->un_nid_count++] = nid;
+ }
+ break;
+ }
+ }
+ if (!found)
+ list_add(&data->un_list, &g_uuid_list);
+ spin_unlock(&g_uuid_lock);
+
+ if (found) {
+ CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+ libcfs_nid2str(nid), entry->un_nid_count);
+ OBD_FREE(data, sizeof(*data));
+ } else {
+ CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+ }
+ return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
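+
+/*
+ * Example of the uuid<->nid mapping above (the uuid and NID strings are
+ * made up): register a peer NID under a uuid, then resolve it back by
+ * index; class_del_uuid(NULL) drops every mapping at once.
+ *
+ *	lnet_nid_t nid;
+ *
+ *	class_add_uuid("OST0000_UUID", libcfs_str2nid("192.168.0.1@tcp"));
+ *	if (lustre_uuid_to_peer("OST0000_UUID", &nid, 0) == 0)
+ *		CDEBUG(D_INFO, "first NID is %s\n", libcfs_nid2str(nid));
+ */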
+
+/* Delete the nids for one uuid if specified, otherwise delete all */
+int class_del_uuid(const char *uuid)
+{
+ LIST_HEAD(deathrow);
+ struct uuid_nid_data *data;
+
+ spin_lock(&g_uuid_lock);
+ if (uuid != NULL) {
+ struct obd_uuid tmp;
+
+ obd_str2uuid(&tmp, uuid);
+ list_for_each_entry(data, &g_uuid_list, un_list) {
+ if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+ list_move(&data->un_list, &deathrow);
+ break;
+ }
+ }
+ } else
+ list_splice_init(&g_uuid_list, &deathrow);
+ spin_unlock(&g_uuid_lock);
+
+ if (uuid != NULL && list_empty(&deathrow)) {
+ CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+ return -EINVAL;
+ }
+
+ while (!list_empty(&deathrow)) {
+ data = list_entry(deathrow.next, struct uuid_nid_data,
+ un_list);
+ list_del(&data->un_list);
+
+ CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+ obd_uuid2str(&data->un_uuid),
+ libcfs_nid2str(data->un_nids[0]),
+ data->un_nid_count);
+
+ OBD_FREE(data, sizeof(*data));
+ }
+
+ return 0;
+}
+
+/* check if @nid exists in nid list of @uuid */
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid)
+{
+ struct uuid_nid_data *entry;
+ int found = 0;
+
+ CDEBUG(D_INFO, "check if uuid %s has %s.\n",
+ obd_uuid2str(uuid), libcfs_nid2str(nid));
+
+ spin_lock(&g_uuid_lock);
+ list_for_each_entry(entry, &g_uuid_list, un_list) {
+ int i;
+
+ if (!obd_uuid_equals(&entry->un_uuid, uuid))
+ continue;
+
+ /* found the uuid, check if it has @nid */
+ for (i = 0; i < entry->un_nid_count; i++) {
+ if (entry->un_nids[i] == nid) {
+ found = 1;
+ break;
+ }
+ }
+ break;
+ }
+ spin_unlock(&g_uuid_lock);
+ return found;
+}
+EXPORT_SYMBOL(class_check_uuid);
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c
new file mode 100644
index 000000000..6ce9adc2f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c
@@ -0,0 +1,1953 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include "../include/obd_class.h"
+#include <linux/string.h>
+#include "../include/lustre_log.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_param.h"
+
+#include "llog_internal.h"
+
+static cfs_hash_ops_t uuid_hash_ops;
+static cfs_hash_ops_t nid_hash_ops;
+static cfs_hash_ops_t nid_stat_hash_ops;
+
+/*********** string parsing utils *********/
+
+/* returns 0 if we find this key in the buffer, else 1 */
+int class_find_param(char *buf, char *key, char **valp)
+{
+ char *ptr;
+
+ if (!buf)
+ return 1;
+
+ ptr = strstr(buf, key);
+ if (ptr == NULL)
+ return 1;
+
+ if (valp)
+ *valp = ptr + strlen(key);
+
+ return 0;
+}
+EXPORT_SYMBOL(class_find_param);
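+
+/*
+ * For example (values made up): with buf = "failover.node=192.168.0.1@tcp"
+ * and key = "failover.node=", class_find_param() returns 0 and points
+ * *valp at "192.168.0.1@tcp"; if the key does not occur anywhere in buf,
+ * it returns 1 and leaves *valp untouched.
+ */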
+
+/**
+ * Check whether the proc parameter \a param is an old parameter or not from
+ * the array \a ptr which contains the mapping from old parameters to new ones.
+ * If it's an old one, then return the pointer to the cfg_interop_param
+ * structure which contains both the old and new parameters.
+ *
+ * \param param proc parameter
+ * \param ptr an array which contains the mapping from
+ * old parameters to new ones
+ *
+ * \retval valid-pointer pointer to the cfg_interop_param structure
+ * which contains the old and new parameters
+ * \retval NULL \a param or \a ptr is NULL,
+ * or \a param is not an old parameter
+ */
+struct cfg_interop_param *class_find_old_param(const char *param,
+ struct cfg_interop_param *ptr)
+{
+ char *value = NULL;
+ int name_len = 0;
+
+ if (param == NULL || ptr == NULL)
+ return NULL;
+
+ value = strchr(param, '=');
+ if (value == NULL)
+ name_len = strlen(param);
+ else
+ name_len = value - param;
+
+ while (ptr->old_param != NULL) {
+ if (strncmp(param, ptr->old_param, name_len) == 0 &&
+ name_len == strlen(ptr->old_param))
+ return ptr;
+ ptr++;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(class_find_old_param);
+
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped. Next space or end of string is the
+ * parameter terminator with the exception that spaces inside single or double
+ * quotes get included into a parameter. The parameter is copied into \a copy
+ * which has to be allocated big enough by a caller, quotes are stripped in
+ * the copy and the copy is terminated by 0.
+ *
+ * On return \a params is set to next parameter or to NULL if last
+ * parameter is returned.
+ *
+ * \retval 0 if parameter is returned in \a copy
+ * \retval 1 otherwise
+ * \retval -EINVAL if an unbalanced quote is found
+ */
+int class_get_next_param(char **params, char *copy)
+{
+ char *q1, *q2, *str;
+ int len;
+
+ str = *params;
+ while (*str == ' ')
+ str++;
+
+ if (*str == '\0') {
+ *params = NULL;
+ return 1;
+ }
+
+ while (1) {
+ q1 = strpbrk(str, " '\"");
+ if (q1 == NULL) {
+ len = strlen(str);
+ memcpy(copy, str, len);
+ copy[len] = '\0';
+ *params = NULL;
+ return 0;
+ }
+ len = q1 - str;
+ if (*q1 == ' ') {
+ memcpy(copy, str, len);
+ copy[len] = '\0';
+ *params = str + len;
+ return 0;
+ }
+
+ memcpy(copy, str, len);
+ copy += len;
+
+ /* search for the matching closing quote */
+ str = q1 + 1;
+ q2 = strchr(str, *q1);
+ if (q2 == NULL) {
+ CERROR("Unbalanced quota in parameters: \"%s\"\n",
+ *params);
+ return -EINVAL;
+ }
+ len = q2 - str;
+ memcpy(copy, str, len);
+ copy += len;
+ str = q2 + 1;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(class_get_next_param);
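+
+/*
+ * Worked example (a sketch; the buffer size is an assumption): with
+ * *params pointing at
+ *
+ *	alpha "beta gamma" delta
+ *
+ * successive calls copy out "alpha", then "beta gamma" (quotes stripped,
+ * the inner space preserved), then "delta"; the next call returns 1 and
+ * sets *params to NULL.
+ *
+ *	char tok[128];
+ *
+ *	while (class_get_next_param(&params, tok) == 0)
+ *		CDEBUG(D_INFO, "param '%s'\n", tok);
+ */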
+
+/* returns 0 if this is the first key in the buffer, else 1.
+ valp points to first char after key. */
+int class_match_param(char *buf, char *key, char **valp)
+{
+ if (!buf)
+ return 1;
+
+ if (memcmp(buf, key, strlen(key)) != 0)
+ return 1;
+
+ if (valp)
+ *valp = buf + strlen(key);
+
+ return 0;
+}
+EXPORT_SYMBOL(class_match_param);
+
+static int parse_nid(char *buf, void *value, int quiet)
+{
+ lnet_nid_t *nid = (lnet_nid_t *)value;
+
+ *nid = libcfs_str2nid(buf);
+ if (*nid != LNET_NID_ANY)
+ return 0;
+
+ if (!quiet)
+ LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf);
+ return -EINVAL;
+}
+
+static int parse_net(char *buf, void *value)
+{
+ __u32 *net = (__u32 *)value;
+
+ *net = libcfs_str2net(buf);
+ CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net));
+ return 0;
+}
+
+enum {
+ CLASS_PARSE_NID = 1,
+ CLASS_PARSE_NET,
+};
+
+/* Returns 0 for a good nid,
+ * 1 if not found,
+ * < 0 on error;
+ * endh is set to the next separator */
+static int class_parse_value(char *buf, int opc, void *value, char **endh,
+ int quiet)
+{
+ char *endp;
+ char tmp;
+ int rc = 0;
+
+ if (!buf)
+ return 1;
+ while (*buf == ',' || *buf == ':')
+ buf++;
+ if (*buf == ' ' || *buf == '/' || *buf == '\0')
+ return 1;
+
+ /* nid separators or end of nids */
+ endp = strpbrk(buf, ",: /");
+ if (endp == NULL)
+ endp = buf + strlen(buf);
+
+ tmp = *endp;
+ *endp = '\0';
+ switch (opc) {
+ default:
+ LBUG();
+ case CLASS_PARSE_NID:
+ rc = parse_nid(buf, value, quiet);
+ break;
+ case CLASS_PARSE_NET:
+ rc = parse_net(buf, value);
+ break;
+ }
+ *endp = tmp;
+ if (rc != 0)
+ return rc;
+ if (endh)
+ *endh = endp;
+ return 0;
+}
+
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+ return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_nid);
+
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh)
+{
+ return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1);
+}
+EXPORT_SYMBOL(class_parse_nid_quiet);
+
+int class_parse_net(char *buf, __u32 *net, char **endh)
+{
+ return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_net);
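+
+/*
+ * Example (NID strings made up): walking a separator-delimited NID list
+ * with the parser above, the same way class_match_nid() below does:
+ *
+ *	char buf[] = "192.168.0.1@tcp,192.168.0.2@tcp";
+ *	char *s = buf;
+ *	lnet_nid_t nid;
+ *
+ *	while (class_parse_nid(s, &nid, &s) == 0)
+ *		CDEBUG(D_INFO, "nid %s\n", libcfs_nid2str(nid));
+ */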
+
+/* Returns 1 if the param contains the key and a nid matches,
+ * 0 if the param contains the key but no nid matches,
+ * -1 if the param does not contain the key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+ lnet_nid_t tmp;
+ int rc = -1;
+
+ while (class_find_param(buf, key, &buf) == 0) {
+ /* walk the NIDs listed after this key and
+ * compare each against the one we were given */
+ while (class_parse_nid(buf, &tmp, &buf) == 0) {
+ if (tmp == nid)
+ return 1;
+ }
+ rc = 0;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+int class_match_net(char *buf, char *key, __u32 net)
+{
+ __u32 tmp;
+ int rc = -1;
+
+ while (class_find_param(buf, key, &buf) == 0) {
+ /* walk the networks listed after this key and
+ * compare each against the one we were given */
+ while (class_parse_net(buf, &tmp, &buf) == 0) {
+ if (tmp == net)
+ return 1;
+ }
+ rc = 0;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid. If successful,
+ * the new device can be accessed by either name or uuid.
+ */
+int class_attach(struct lustre_cfg *lcfg)
+{
+ struct obd_device *obd = NULL;
+ char *typename, *name, *uuid;
+ int rc, len;
+
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+ CERROR("No type passed!\n");
+ return -EINVAL;
+ }
+ typename = lustre_cfg_string(lcfg, 1);
+
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
+ CERROR("No name passed!\n");
+ return -EINVAL;
+ }
+ name = lustre_cfg_string(lcfg, 0);
+
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
+ CERROR("No UUID passed!\n");
+ return -EINVAL;
+ }
+ uuid = lustre_cfg_string(lcfg, 2);
+
+ CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
+ MKSTR(typename), MKSTR(name), MKSTR(uuid));
+
+ obd = class_newdev(typename, name);
+ if (IS_ERR(obd)) {
+ /* Already exists or out of obds */
+ rc = PTR_ERR(obd);
+ obd = NULL;
+ CERROR("Cannot create device %s of type %s : %d\n",
+ name, typename, rc);
+ goto out;
+ }
+ LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
+ name, typename);
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+ "obd %p obd_magic %08X != %08X\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+ "%p obd_name %s != %s\n", obd, obd->obd_name, name);
+
+ rwlock_init(&obd->obd_pool_lock);
+ obd->obd_pool_limit = 0;
+ obd->obd_pool_slv = 0;
+
+ INIT_LIST_HEAD(&obd->obd_exports);
+ INIT_LIST_HEAD(&obd->obd_unlinked_exports);
+ INIT_LIST_HEAD(&obd->obd_delayed_exports);
+ INIT_LIST_HEAD(&obd->obd_exports_timed);
+ INIT_LIST_HEAD(&obd->obd_nid_stats);
+ spin_lock_init(&obd->obd_nid_lock);
+ spin_lock_init(&obd->obd_dev_lock);
+ mutex_init(&obd->obd_dev_mutex);
+ spin_lock_init(&obd->obd_osfs_lock);
+ /* obd->obd_osfs_age must be set to a value in the distant
+ * past to guarantee a fresh statfs is fetched on mount. */
+ obd->obd_osfs_age = cfs_time_shift_64(-1000);
+
+ /* XXX belongs in setup not attach */
+ init_rwsem(&obd->obd_observer_link_sem);
+ /* recovery data */
+ cfs_init_timer(&obd->obd_recovery_timer);
+ spin_lock_init(&obd->obd_recovery_task_lock);
+ init_waitqueue_head(&obd->obd_next_transno_waitq);
+ init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
+ INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+ INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+ INIT_LIST_HEAD(&obd->obd_final_req_queue);
+ INIT_LIST_HEAD(&obd->obd_evict_list);
+
+ llog_group_init(&obd->obd_olg, FID_SEQ_LLOG);
+
+ obd->obd_conn_inprogress = 0;
+
+ len = strlen(uuid);
+ if (len >= sizeof(obd->obd_uuid)) {
+ CERROR("uuid must be < %d bytes long\n",
+ (int)sizeof(obd->obd_uuid));
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(obd->obd_uuid.uuid, uuid, len);
+
+ /* do the attach */
+ if (OBP(obd, attach)) {
+ rc = OBP(obd, attach)(obd, sizeof(*lcfg), lcfg);
+ if (rc) {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Detach drops this */
+ spin_lock(&obd->obd_dev_lock);
+ atomic_set(&obd->obd_refcount, 1);
+ spin_unlock(&obd->obd_dev_lock);
+ lu_ref_init(&obd->obd_reference);
+ lu_ref_add(&obd->obd_reference, "attach", obd);
+
+ obd->obd_attached = 1;
+ CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+ obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
+ return 0;
+ out:
+ if (obd != NULL) {
+ class_release_dev(obd);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(class_attach);
+
+/** Create hashes, self-export, and call type-specific setup.
+ * Setup is effectively the "start this obd" call.
+ */
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ int err = 0;
+ struct obd_export *exp;
+
+ LASSERT(obd != NULL);
+ LASSERTF(obd == class_num2obd(obd->obd_minor),
+ "obd %p != obd_devs[%d] %p\n",
+ obd, obd->obd_minor, class_num2obd(obd->obd_minor));
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+ "obd %p obd_magic %08x != %08x\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+ /* have we attached a type to this device? */
+ if (!obd->obd_attached) {
+ CERROR("Device %d not attached\n", obd->obd_minor);
+ return -ENODEV;
+ }
+
+ if (obd->obd_set_up) {
+ CERROR("Device %d already setup (type %s)\n",
+ obd->obd_minor, obd->obd_type->typ_name);
+ return -EEXIST;
+ }
+
+ /* is someone else setting us up right now? (attach inits spinlock) */
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_starting) {
+ spin_unlock(&obd->obd_dev_lock);
+ CERROR("Device %d setup in progress (type %s)\n",
+ obd->obd_minor, obd->obd_type->typ_name);
+ return -EEXIST;
+ }
+ /* just leave this on forever. I can't use obd_set_up here because
+ other fns check that status, and we're not actually set up yet. */
+ obd->obd_starting = 1;
+ obd->obd_uuid_hash = NULL;
+ obd->obd_nid_hash = NULL;
+ obd->obd_nid_stats_hash = NULL;
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* create a uuid-export lustre hash */
+ obd->obd_uuid_hash = cfs_hash_create("UUID_HASH",
+ HASH_UUID_CUR_BITS,
+ HASH_UUID_MAX_BITS,
+ HASH_UUID_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &uuid_hash_ops, CFS_HASH_DEFAULT);
+ if (!obd->obd_uuid_hash) {
+ err = -ENOMEM;
+ goto err_hash;
+ }
+
+ /* create a nid-export lustre hash */
+ obd->obd_nid_hash = cfs_hash_create("NID_HASH",
+ HASH_NID_CUR_BITS,
+ HASH_NID_MAX_BITS,
+ HASH_NID_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &nid_hash_ops, CFS_HASH_DEFAULT);
+ if (!obd->obd_nid_hash) {
+ err = -ENOMEM;
+ goto err_hash;
+ }
+
+ /* create a nid-stats lustre hash */
+ obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
+ HASH_NID_STATS_CUR_BITS,
+ HASH_NID_STATS_MAX_BITS,
+ HASH_NID_STATS_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &nid_stat_hash_ops, CFS_HASH_DEFAULT);
+ if (!obd->obd_nid_stats_hash) {
+ err = -ENOMEM;
+ goto err_hash;
+ }
+
+ exp = class_new_export(obd, &obd->obd_uuid);
+ if (IS_ERR(exp)) {
+ err = PTR_ERR(exp);
+ goto err_hash;
+ }
+
+ obd->obd_self_export = exp;
+ list_del_init(&exp->exp_obd_chain_timed);
+ class_export_put(exp);
+
+ err = obd_setup(obd, lcfg);
+ if (err)
+ goto err_exp;
+
+ obd->obd_set_up = 1;
+
+ spin_lock(&obd->obd_dev_lock);
+ /* cleanup drops this */
+ class_incref(obd, "setup", obd);
+ spin_unlock(&obd->obd_dev_lock);
+
+ CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
+ obd->obd_name, obd->obd_uuid.uuid);
+
+ return 0;
+err_exp:
+ if (obd->obd_self_export) {
+ class_unlink_export(obd->obd_self_export);
+ obd->obd_self_export = NULL;
+ }
+err_hash:
+ if (obd->obd_uuid_hash) {
+ cfs_hash_putref(obd->obd_uuid_hash);
+ obd->obd_uuid_hash = NULL;
+ }
+ if (obd->obd_nid_hash) {
+ cfs_hash_putref(obd->obd_nid_hash);
+ obd->obd_nid_hash = NULL;
+ }
+ if (obd->obd_nid_stats_hash) {
+ cfs_hash_putref(obd->obd_nid_stats_hash);
+ obd->obd_nid_stats_hash = NULL;
+ }
+ obd->obd_starting = 0;
+ CERROR("setup %s failed (%d)\n", obd->obd_name, err);
+ return err;
+}
+EXPORT_SYMBOL(class_setup);
+
+/** We have finished using this obd and are ready to destroy it.
+ * There can be no more references to this obd.
+ */
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ if (obd->obd_set_up) {
+ CERROR("OBD device %d still set up\n", obd->obd_minor);
+ return -EBUSY;
+ }
+
+ spin_lock(&obd->obd_dev_lock);
+ if (!obd->obd_attached) {
+ spin_unlock(&obd->obd_dev_lock);
+ CERROR("OBD device %d not attached\n", obd->obd_minor);
+ return -ENODEV;
+ }
+ obd->obd_attached = 0;
+ spin_unlock(&obd->obd_dev_lock);
+
+ CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
+ obd->obd_name, obd->obd_uuid.uuid);
+
+ class_decref(obd, "attach", obd);
+ return 0;
+}
+EXPORT_SYMBOL(class_detach);
+
+/** Start shutting down the obd. There may be in-progress ops when
+ * this is called. We tell them to start shutting down with a call
+ * to class_disconnect_exports().
+ */
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ int err = 0;
+ char *flag;
+
+ OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
+
+ if (!obd->obd_set_up) {
+ CERROR("Device %d not setup\n", obd->obd_minor);
+ return -ENODEV;
+ }
+
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_stopping) {
+ spin_unlock(&obd->obd_dev_lock);
+ CERROR("OBD %d already stopping\n", obd->obd_minor);
+ return -ENODEV;
+ }
+ /* Leave this on forever */
+ obd->obd_stopping = 1;
+
+ /* wait for already-arrived-connections to finish. */
+ while (obd->obd_conn_inprogress > 0) {
+ spin_unlock(&obd->obd_dev_lock);
+
+ cond_resched();
+
+ spin_lock(&obd->obd_dev_lock);
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+ for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
+ switch (*flag) {
+ case 'F':
+ obd->obd_force = 1;
+ break;
+ case 'A':
+ LCONSOLE_WARN("Failing over %s\n",
+ obd->obd_name);
+ obd->obd_fail = 1;
+ obd->obd_no_transno = 1;
+ obd->obd_no_recov = 1;
+ if (OBP(obd, iocontrol)) {
+ obd_iocontrol(OBD_IOC_SYNC,
+ obd->obd_self_export,
+ 0, NULL, NULL);
+ }
+ break;
+ default:
+ CERROR("Unrecognised flag '%c'\n", *flag);
+ }
+ }
+
+ LASSERT(obd->obd_self_export);
+
+ /* The three references that should be remaining are the
+ * obd_self_export and the attach and setup references. */
+ if (atomic_read(&obd->obd_refcount) > 3) {
+ /* refcount - 3 might be the number of real exports
+ (excluding self export). But class_incref is called
+ by other things as well, so don't count on it. */
+ CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
+ obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
+ dump_exports(obd, 0);
+ class_disconnect_exports(obd);
+ }
+
+ /* Precleanup, we must make sure all exports get destroyed. */
+ err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS);
+ if (err)
+ CERROR("Precleanup %s returned %d\n",
+ obd->obd_name, err);
+
+ /* destroy the uuid-export hash body */
+ if (obd->obd_uuid_hash) {
+ cfs_hash_putref(obd->obd_uuid_hash);
+ obd->obd_uuid_hash = NULL;
+ }
+
+ /* destroy a nid-export hash body */
+ if (obd->obd_nid_hash) {
+ cfs_hash_putref(obd->obd_nid_hash);
+ obd->obd_nid_hash = NULL;
+ }
+
+ /* destroy a nid-stats hash body */
+ if (obd->obd_nid_stats_hash) {
+ cfs_hash_putref(obd->obd_nid_stats_hash);
+ obd->obd_nid_stats_hash = NULL;
+ }
+
+ class_decref(obd, "setup", obd);
+ obd->obd_set_up = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL(class_cleanup);
+
+struct obd_device *class_incref(struct obd_device *obd,
+ const char *scope, const void *source)
+{
+ lu_ref_add_atomic(&obd->obd_reference, scope, source);
+ atomic_inc(&obd->obd_refcount);
+ CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+ atomic_read(&obd->obd_refcount));
+
+ return obd;
+}
+EXPORT_SYMBOL(class_incref);
+
+void class_decref(struct obd_device *obd, const char *scope, const void *source)
+{
+ int err;
+ int refs;
+
+ spin_lock(&obd->obd_dev_lock);
+ atomic_dec(&obd->obd_refcount);
+ refs = atomic_read(&obd->obd_refcount);
+ spin_unlock(&obd->obd_dev_lock);
+ lu_ref_del(&obd->obd_reference, scope, source);
+
+ CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+
+ if ((refs == 1) && obd->obd_stopping) {
+ /* All exports have been destroyed; there should
+ be no more in-progress ops by this point.*/
+
+ spin_lock(&obd->obd_self_export->exp_lock);
+ obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
+ spin_unlock(&obd->obd_self_export->exp_lock);
+
+ /* note that we'll recurse into class_decref again */
+ class_unlink_export(obd->obd_self_export);
+ return;
+ }
+
+ if (refs == 0) {
+ CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+ obd->obd_name, obd->obd_uuid.uuid);
+ LASSERT(!obd->obd_attached);
+ if (obd->obd_stopping) {
+ /* If we're not stopping, we were never set up */
+ err = obd_cleanup(obd);
+ if (err)
+ CERROR("Cleanup %s returned %d\n",
+ obd->obd_name, err);
+ }
+ if (OBP(obd, detach)) {
+ err = OBP(obd, detach)(obd);
+ if (err)
+ CERROR("Detach returned %d\n", err);
+ }
+ class_release_dev(obd);
+ }
+}
+EXPORT_SYMBOL(class_decref);
+
+/** Add a failover nid location.
+ * Client obd types contact server obd types using this nid list.
+ */
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct obd_import *imp;
+ struct obd_uuid uuid;
+ int rc;
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+ LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+ CERROR("invalid conn_uuid\n");
+ return -EINVAL;
+ }
+ if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+ CERROR("can't add connection on non-client dev\n");
+ return -EINVAL;
+ }
+
+ imp = obd->u.cli.cl_import;
+ if (!imp) {
+ CERROR("try to add conn on immature client dev\n");
+ return -EINVAL;
+ }
+
+ obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+ rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num);
+
+ return rc;
+}
+EXPORT_SYMBOL(class_add_conn);
+
+/** Remove a failover nid location.
+ */
+int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct obd_import *imp;
+ struct obd_uuid uuid;
+ int rc;
+
+ if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+ LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+ CERROR("invalid conn_uuid\n");
+ return -EINVAL;
+ }
+ if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+ CERROR("can't del connection on non-client dev\n");
+ return -EINVAL;
+ }
+
+ imp = obd->u.cli.cl_import;
+ if (!imp) {
+ CERROR("try to del conn on immature client dev\n");
+ return -EINVAL;
+ }
+
+ obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+ rc = obd_del_conn(imp, &uuid);
+
+ return rc;
+}
+
+LIST_HEAD(lustre_profile_list);
+
+struct lustre_profile *class_get_profile(const char *prof)
+{
+ struct lustre_profile *lprof;
+
+ list_for_each_entry(lprof, &lustre_profile_list, lp_list) {
+ if (!strcmp(lprof->lp_profile, prof)) {
+ return lprof;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(class_get_profile);
+
+/** Create a named "profile".
+ * This defines the mdc and osc names to use for a client.
+ * It is also used to define the lov to be used by an mdt.
+ */
+int class_add_profile(int proflen, char *prof, int osclen, char *osc,
+ int mdclen, char *mdc)
+{
+ struct lustre_profile *lprof;
+ int err = 0;
+
+ CDEBUG(D_CONFIG, "Add profile %s\n", prof);
+
+ OBD_ALLOC(lprof, sizeof(*lprof));
+ if (lprof == NULL)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&lprof->lp_list);
+
+ LASSERT(proflen == (strlen(prof) + 1));
+ OBD_ALLOC(lprof->lp_profile, proflen);
+ if (lprof->lp_profile == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(lprof->lp_profile, prof, proflen);
+
+ LASSERT(osclen == (strlen(osc) + 1));
+ OBD_ALLOC(lprof->lp_dt, osclen);
+ if (lprof->lp_dt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(lprof->lp_dt, osc, osclen);
+
+ if (mdclen > 0) {
+ LASSERT(mdclen == (strlen(mdc) + 1));
+ OBD_ALLOC(lprof->lp_md, mdclen);
+ if (lprof->lp_md == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(lprof->lp_md, mdc, mdclen);
+ }
+
+ list_add(&lprof->lp_list, &lustre_profile_list);
+ return err;
+
+out:
+ if (lprof->lp_md)
+ OBD_FREE(lprof->lp_md, mdclen);
+ if (lprof->lp_dt)
+ OBD_FREE(lprof->lp_dt, osclen);
+ if (lprof->lp_profile)
+ OBD_FREE(lprof->lp_profile, proflen);
+ OBD_FREE(lprof, sizeof(*lprof));
+ return err;
+}
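+
+/*
+ * For a client mount the profile typically maps the client name to its
+ * top-level data and metadata devices, e.g. (names illustrative):
+ * lp_profile = "lustre-client", lp_dt = "lustre-clilov" and
+ * lp_md = "lustre-clilmv", so that ll_fill_super() can later look the
+ * profile up by name (class_get_profile() above).
+ */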
+
+void class_del_profile(const char *prof)
+{
+ struct lustre_profile *lprof;
+
+ CDEBUG(D_CONFIG, "Del profile %s\n", prof);
+
+ lprof = class_get_profile(prof);
+ if (lprof) {
+ list_del(&lprof->lp_list);
+ OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+ OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+ if (lprof->lp_md)
+ OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+ OBD_FREE(lprof, sizeof(*lprof));
+ }
+}
+EXPORT_SYMBOL(class_del_profile);
+
+/* COMPAT_146 */
+void class_del_profiles(void)
+{
+ struct lustre_profile *lprof, *n;
+
+ list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) {
+ list_del(&lprof->lp_list);
+ OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+ OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+ if (lprof->lp_md)
+ OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+ OBD_FREE(lprof, sizeof(*lprof));
+ }
+}
+EXPORT_SYMBOL(class_del_profiles);
+
+static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg)
+{
+ if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0)
+ at_min = val;
+ else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0)
+ at_max = val;
+ else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0)
+ at_extra = val;
+ else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0)
+ at_early_margin = val;
+ else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0)
+ at_history = val;
+ else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0)
+ strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2),
+ JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+ else
+ return -EINVAL;
+
+ CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+ return 0;
+}
+
+
+/* We can't call ll_process_config or lquota_process_config directly because
+ * they live in modules that must be loaded after this one. */
+static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL;
+static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL;
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg))
+{
+ client_process_config = cpc;
+}
+EXPORT_SYMBOL(lustre_register_client_process_config);
+
+/**
+ * Rename the proc parameter in \a cfg with a new name \a new_name.
+ *
+ * \param cfg config structure which contains the proc parameter
+ * \param new_name new name of the proc parameter
+ *
+ * \retval valid-pointer pointer to the newly-allocated config structure
+ * which contains the renamed proc parameter
+ * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does
+ * not contain a proc parameter
+ * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs
+ */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+ const char *new_name)
+{
+ struct lustre_cfg_bufs *bufs = NULL;
+ struct lustre_cfg *new_cfg = NULL;
+ char *param = NULL;
+ char *new_param = NULL;
+ char *value = NULL;
+ int name_len = 0;
+ int new_len = 0;
+
+ if (cfg == NULL || new_name == NULL)
+ return ERR_PTR(-EINVAL);
+
+ param = lustre_cfg_string(cfg, 1);
+ if (param == NULL)
+ return ERR_PTR(-EINVAL);
+
+ value = strchr(param, '=');
+ if (value == NULL)
+ name_len = strlen(param);
+ else
+ name_len = value - param;
+
+ new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len;
+
+ OBD_ALLOC(new_param, new_len);
+ if (new_param == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ strcpy(new_param, new_name);
+ if (value != NULL)
+ strcat(new_param, value);
+
+ OBD_ALLOC_PTR(bufs);
+ if (bufs == NULL) {
+ OBD_FREE(new_param, new_len);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ lustre_cfg_bufs_reset(bufs, NULL);
+ lustre_cfg_bufs_init(bufs, cfg);
+ lustre_cfg_bufs_set_string(bufs, 1, new_param);
+
+ new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs);
+
+ OBD_FREE(new_param, new_len);
+ OBD_FREE_PTR(bufs);
+ if (new_cfg == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ new_cfg->lcfg_num = cfg->lcfg_num;
+ new_cfg->lcfg_flags = cfg->lcfg_flags;
+ new_cfg->lcfg_nid = cfg->lcfg_nid;
+ new_cfg->lcfg_nal = cfg->lcfg_nal;
+
+ return new_cfg;
+}
+EXPORT_SYMBOL(lustre_cfg_rename);
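+
+/*
+ * For example (parameter names made up): if buffer 1 of \a cfg holds
+ * "osc.max_dirty_mb=36" and \a new_name is "osc.max_dirty_megabytes",
+ * the returned config carries "osc.max_dirty_megabytes=36" in buffer 1,
+ * with lcfg_num, lcfg_flags, lcfg_nid and lcfg_nal copied unchanged.
+ */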
+
+static int process_param2_config(struct lustre_cfg *lcfg)
+{
+ char *param = lustre_cfg_string(lcfg, 1);
+ char *upcall = lustre_cfg_string(lcfg, 2);
+ char *argv[] = {
+ [0] = "/usr/sbin/lctl",
+ [1] = "set_param",
+ [2] = param,
+ [3] = NULL
+ };
+ struct timeval start;
+ struct timeval end;
+ int rc;
+
+
+ /* Add upcall processing here. Now only lctl is supported */
+ if (strcmp(upcall, LCTL_UPCALL) != 0) {
+ CERROR("Unsupported upcall %s\n", upcall);
+ return -EINVAL;
+ }
+
+ do_gettimeofday(&start);
+ rc = call_usermodehelper(argv[0], argv, NULL, 1);
+ do_gettimeofday(&end);
+
+ if (rc < 0) {
+ CERROR(
+ "lctl: error invoking upcall %s %s %s: rc = %d; time %ldus\n",
+ argv[0], argv[1], argv[2], rc,
+ cfs_timeval_sub(&end, &start, NULL));
+ } else {
+ CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n",
+ argv[0], argv[1], argv[2],
+ cfs_timeval_sub(&end, &start, NULL));
+ rc = 0;
+ }
+
+ return rc;
+}
+
+void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg))
+{
+ quota_process_config = qpc;
+}
+EXPORT_SYMBOL(lustre_register_quota_process_config);
+
+/** Process configuration commands given in lustre_cfg form.
+ * These may come from direct calls (e.g. class_manual_cleanup)
+ * or processing the config llog, or ioctl from lctl.
+ */
+int class_process_config(struct lustre_cfg *lcfg)
+{
+ struct obd_device *obd;
+ int err;
+
+ LASSERT(lcfg && !IS_ERR(lcfg));
+ CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
+
+ /* Commands that don't need a device */
+ switch (lcfg->lcfg_command) {
+ case LCFG_ATTACH: {
+ err = class_attach(lcfg);
+ goto out;
+ }
+ case LCFG_ADD_UUID: {
+ CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid %#llx (%s)\n",
+ lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid,
+ libcfs_nid2str(lcfg->lcfg_nid));
+
+ err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+ goto out;
+ }
+ case LCFG_DEL_UUID: {
+ CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+ (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+ ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
+
+ err = class_del_uuid(lustre_cfg_string(lcfg, 1));
+ goto out;
+ }
+ case LCFG_MOUNTOPT: {
+ CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
+ lustre_cfg_string(lcfg, 1),
+ lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
+ /* set these mount options somewhere, so ll_fill_super
+ * can find them. */
+ err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+ lustre_cfg_string(lcfg, 1),
+ LUSTRE_CFG_BUFLEN(lcfg, 2),
+ lustre_cfg_string(lcfg, 2),
+ LUSTRE_CFG_BUFLEN(lcfg, 3),
+ lustre_cfg_string(lcfg, 3));
+ goto out;
+ }
+ case LCFG_DEL_MOUNTOPT: {
+ CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+ lustre_cfg_string(lcfg, 1));
+ class_del_profile(lustre_cfg_string(lcfg, 1));
+ err = 0;
+ goto out;
+ }
+ case LCFG_SET_TIMEOUT: {
+ CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
+ obd_timeout, lcfg->lcfg_num);
+ obd_timeout = max(lcfg->lcfg_num, 1U);
+ obd_timeout_set = 1;
+ err = 0;
+ goto out;
+ }
+ case LCFG_SET_LDLM_TIMEOUT: {
+ CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n",
+ ldlm_timeout, lcfg->lcfg_num);
+ ldlm_timeout = max(lcfg->lcfg_num, 1U);
+ if (ldlm_timeout >= obd_timeout)
+ ldlm_timeout = max(obd_timeout / 3, 1U);
+ ldlm_timeout_set = 1;
+ err = 0;
+ goto out;
+ }
+ case LCFG_SET_UPCALL: {
+ LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n");
+ /* COMPAT_146 Don't fail on old configs */
+ err = 0;
+ goto out;
+ }
+ case LCFG_MARKER: {
+ struct cfg_marker *marker;
+ marker = lustre_cfg_buf(lcfg, 1);
+ CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+ marker->cm_flags, marker->cm_tgtname, marker->cm_comment);
+ err = 0;
+ goto out;
+ }
+ case LCFG_PARAM: {
+ char *tmp;
+ /* llite has no obd */
+ if ((class_match_param(lustre_cfg_string(lcfg, 1),
+ PARAM_LLITE, NULL) == 0) &&
+ client_process_config) {
+ err = (*client_process_config)(lcfg);
+ goto out;
+ } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+ PARAM_SYS, &tmp) == 0)) {
+ /* Global param settings */
+ err = class_set_global(tmp, lcfg->lcfg_num, lcfg);
+ /*
+ * Client or server should not fail to mount if
+ * it hits an unknown configuration parameter.
+ */
+ if (err != 0)
+ CWARN("Ignoring unknown param %s\n", tmp);
+
+ err = 0;
+ goto out;
+ } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+ PARAM_QUOTA, &tmp) == 0) &&
+ quota_process_config) {
+ err = (*quota_process_config)(lcfg);
+ goto out;
+ }
+
+ break;
+ }
+ case LCFG_SET_PARAM: {
+ err = process_param2_config(lcfg);
+ goto out;
+ }
+ }
+ /* Commands that require a device */
+ obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+ if (obd == NULL) {
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
+ CERROR("this lcfg command requires a device name\n");
+ else
+ CERROR("no device for: %s\n",
+ lustre_cfg_string(lcfg, 0));
+
+ err = -EINVAL;
+ goto out;
+ }
+
+ switch (lcfg->lcfg_command) {
+ case LCFG_SETUP: {
+ err = class_setup(obd, lcfg);
+ goto out;
+ }
+ case LCFG_DETACH: {
+ err = class_detach(obd, lcfg);
+ err = 0;
+ goto out;
+ }
+ case LCFG_CLEANUP: {
+ err = class_cleanup(obd, lcfg);
+ err = 0;
+ goto out;
+ }
+ case LCFG_ADD_CONN: {
+ err = class_add_conn(obd, lcfg);
+ err = 0;
+ goto out;
+ }
+ case LCFG_DEL_CONN: {
+ err = class_del_conn(obd, lcfg);
+ err = 0;
+ goto out;
+ }
+ case LCFG_POOL_NEW: {
+ err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+ err = 0;
+ goto out;
+ }
+ case LCFG_POOL_ADD: {
+ err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
+ err = 0;
+ goto out;
+ }
+ case LCFG_POOL_REM: {
+ err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
+ err = 0;
+ goto out;
+ }
+ case LCFG_POOL_DEL: {
+ err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+ err = 0;
+ goto out;
+ }
+ default: {
+ err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+ goto out;
+
+ }
+ }
+out:
+ if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+ CWARN("Ignoring error %d on optional command %#x\n", err,
+ lcfg->lcfg_command);
+ err = 0;
+ }
+ return err;
+}
+EXPORT_SYMBOL(class_process_config);
+
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+ struct lustre_cfg *lcfg, void *data)
+{
+ struct lprocfs_vars *var;
+ struct file fakefile;
+ struct seq_file fake_seqfile;
+ char *key, *sval;
+ int i, keylen, vallen;
+ int matched = 0, j = 0;
+ int rc = 0;
+ int skip = 0;
+
+ if (lcfg->lcfg_command != LCFG_PARAM) {
+ CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+ return -EINVAL;
+ }
+
+ /* fake a seq file so that var->fops->write can work... */
+ fakefile.private_data = &fake_seqfile;
+ fake_seqfile.private = data;
+ /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt
+ or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar
+ or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */
+ for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+ key = lustre_cfg_buf(lcfg, i);
+ /* Strip off prefix */
+ class_match_param(key, prefix, &key);
+ sval = strchr(key, '=');
+ if (!sval || (*(sval + 1) == 0)) {
+ CERROR("Can't parse param %s (missing '=')\n", key);
+ /* rc = -EINVAL; continue parsing other params */
+ continue;
+ }
+ keylen = sval - key;
+ sval++;
+ vallen = strlen(sval);
+ matched = 0;
+ j = 0;
+ /* Search proc entries */
+ while (lvars[j].name) {
+ var = &lvars[j];
+ if (class_match_param(key, (char *)var->name, NULL) == 0
+ && keylen == strlen(var->name)) {
+ matched++;
+ rc = -EROFS;
+ if (var->fops && var->fops->write) {
+ mm_segment_t oldfs;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ rc = (var->fops->write)(&fakefile, sval,
+ vallen, NULL);
+ set_fs(oldfs);
+ }
+ break;
+ }
+ j++;
+ }
+ if (!matched) {
+ /* If the prefix doesn't match, return error so we
+ can pass it down the stack */
+ if (strnchr(key, keylen, '.'))
+ return -ENOSYS;
+ CERROR("%s: unknown param %s\n",
+ (char *)lustre_cfg_string(lcfg, 0), key);
+ /* rc = -EINVAL; continue parsing other params */
+ skip++;
+ } else if (rc < 0) {
+ CERROR("writing proc entry %s err %d\n",
+ var->name, rc);
+ rc = 0;
+ } else {
+ CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n",
+ lustre_cfg_string(lcfg, 0),
+ (int)strlen(prefix) - 1, prefix,
+ (int)(sval - key - 1), key, sval);
+ }
+ }
+
+ if (rc > 0)
+ rc = 0;
+ if (!rc && skip)
+ rc = skip;
+ return rc;
+}
+EXPORT_SYMBOL(class_process_proc_param);
+
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+
+/** Parse a configuration llog, performing various manipulations on the
+ * records (compatibility fixes, skipping obsolete records, changing
+ * uuids, etc.), then feed the resulting records to class_process_config().
+ */
+int class_config_llog_handler(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct config_llog_instance *clli = data;
+ int cfg_len = rec->lrh_len;
+ char *cfg_buf = (char *) (rec + 1);
+ int rc = 0;
+
+ //class_config_dump_handler(handle, rec, data);
+
+ switch (rec->lrh_type) {
+ case OBD_CFG_REC: {
+ struct lustre_cfg *lcfg, *lcfg_new;
+ struct lustre_cfg_bufs bufs;
+ char *inst_name = NULL;
+ int inst_len = 0;
+ int inst = 0, swab = 0;
+
+ lcfg = (struct lustre_cfg *)cfg_buf;
+ if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+ lustre_swab_lustre_cfg(lcfg);
+ swab = 1;
+ }
+
+ rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+ if (rc)
+ goto out;
+
+ /* Figure out config state info */
+ if (lcfg->lcfg_command == LCFG_MARKER) {
+ struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+ lustre_swab_cfg_marker(marker, swab,
+ LUSTRE_CFG_BUFLEN(lcfg, 1));
+ CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
+ clli->cfg_flags, marker->cm_flags);
+ if (marker->cm_flags & CM_START) {
+ /* all previous flags off */
+ clli->cfg_flags = CFG_F_MARKER;
+ if (marker->cm_flags & CM_SKIP) {
+ clli->cfg_flags |= CFG_F_SKIP;
+ CDEBUG(D_CONFIG, "SKIP #%d\n",
+ marker->cm_step);
+ } else if ((marker->cm_flags & CM_EXCLUDE) ||
+ (clli->cfg_sb &&
+ lustre_check_exclusion(clli->cfg_sb,
+ marker->cm_tgtname))) {
+ clli->cfg_flags |= CFG_F_EXCLUDE;
+ CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+ marker->cm_step);
+ }
+ } else if (marker->cm_flags & CM_END) {
+ clli->cfg_flags = 0;
+ }
+ }
+ /* A config command without a start marker before it is
+ illegal (post 146) */
+ if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
+ !(clli->cfg_flags & CFG_F_MARKER) &&
+ (lcfg->lcfg_command != LCFG_MARKER)) {
+ CWARN("Config not inside markers, ignoring! (inst: %p, uuid: %s, flags: %#x)\n",
+ clli->cfg_instance,
+ clli->cfg_uuid.uuid, clli->cfg_flags);
+ clli->cfg_flags |= CFG_F_SKIP;
+ }
+ if (clli->cfg_flags & CFG_F_SKIP) {
+ CDEBUG(D_CONFIG, "skipping %#x\n",
+ clli->cfg_flags);
+ rc = 0;
+ /* No processing! */
+ break;
+ }
+
+ /*
+ * For interoperability between 1.8 and 2.0,
+ * rename "mds" obd device type to "mdt".
+ */
+ {
+ char *typename = lustre_cfg_string(lcfg, 1);
+ char *index = lustre_cfg_string(lcfg, 2);
+
+ if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+ strcmp(typename, "mds") == 0)) {
+ CWARN("For 1.8 interoperability, rename obd type from mds to mdt\n");
+ typename[2] = 't';
+ }
+ if ((lcfg->lcfg_command == LCFG_SETUP && index &&
+ strcmp(index, "type") == 0)) {
+ CDEBUG(D_INFO, "For 1.8 interoperability, set this index to '0'\n");
+ index[0] = '0';
+ index[1] = 0;
+ }
+ }
+
+
+ if (clli->cfg_flags & CFG_F_EXCLUDE) {
+ CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n",
+ lcfg->lcfg_command);
+ if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)
+ /* Add inactive instead */
+ lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+ }
+
+ lustre_cfg_bufs_init(&bufs, lcfg);
+
+ if (clli && clli->cfg_instance &&
+ LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){
+ inst = 1;
+ inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+ sizeof(clli->cfg_instance) * 2 + 4;
+ OBD_ALLOC(inst_name, inst_len);
+ if (inst_name == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ sprintf(inst_name, "%s-%p",
+ lustre_cfg_string(lcfg, 0),
+ clli->cfg_instance);
+ lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+ CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
+ lcfg->lcfg_command, inst_name);
+ }
+
+ /* we override the llog's uuid for clients, to ensure they
+ are unique */
+ if (clli && clli->cfg_instance != NULL &&
+ lcfg->lcfg_command == LCFG_ATTACH) {
+ lustre_cfg_bufs_set_string(&bufs, 2,
+ clli->cfg_uuid.uuid);
+ }
+ /*
+ * sptlrpc config record, we expect 2 data segments:
+ * [0]: fs_name/target_name,
+ * [1]: rule string
+ * move them to index [1] and [2], and insert the MGC's
+ * obdname at index [0].
+ */
+ if (clli && clli->cfg_instance == NULL &&
+ lcfg->lcfg_command == LCFG_SPTLRPC_CONF) {
+ lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1],
+ bufs.lcfg_buflen[1]);
+ lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0],
+ bufs.lcfg_buflen[0]);
+ lustre_cfg_bufs_set_string(&bufs, 0,
+ clli->cfg_obdname);
+ }
+
+ lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
+
+ lcfg_new->lcfg_num = lcfg->lcfg_num;
+ lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+
+ /* XXX Hack to try to remain binary compatible with
+ * pre-newconfig logs */
+ if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */
+ (lcfg->lcfg_nid >> 32) == 0) {
+ __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff);
+
+ lcfg_new->lcfg_nid =
+ LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr);
+ CWARN("Converted pre-newconfig NAL %d NID %x to %s\n",
+ lcfg->lcfg_nal, addr,
+ libcfs_nid2str(lcfg_new->lcfg_nid));
+ } else {
+ lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+ }
+
+ lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */
+
+ rc = class_process_config(lcfg_new);
+ lustre_cfg_free(lcfg_new);
+
+ if (inst)
+ OBD_FREE(inst_name, inst_len);
+ break;
+ }
+ default:
+ CERROR("Unknown llog record type %#x encountered\n",
+ rec->lrh_type);
+ break;
+ }
+out:
+ if (rc) {
+ CERROR("%s: cfg command failed: rc = %d\n",
+ handle->lgh_ctxt->loc_obd->obd_name, rc);
+ class_config_dump_handler(NULL, handle, rec, data);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(class_config_llog_handler);
+
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name, struct config_llog_instance *cfg)
+{
+ struct llog_process_cat_data cd = {0, 0};
+ struct llog_handle *llh;
+ llog_cb_t callback;
+ int rc;
+
+ CDEBUG(D_INFO, "looking up llog %s\n", name);
+ rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+ if (rc)
+ return rc;
+
+ rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ goto parse_out;
+
+ /* continue processing from where we last stopped to end-of-log */
+ if (cfg) {
+ cd.lpcd_first_idx = cfg->cfg_last_idx;
+ callback = cfg->cfg_callback;
+ LASSERT(callback != NULL);
+ } else {
+ callback = class_config_llog_handler;
+ }
+
+ cd.lpcd_last_idx = 0;
+
+ rc = llog_process(env, llh, callback, cfg, &cd);
+
+ CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+ cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc);
+ if (cfg)
+ cfg->cfg_last_idx = cd.lpcd_last_idx;
+
+parse_out:
+ llog_close(env, llh);
+ return rc;
+}
+EXPORT_SYMBOL(class_config_parse_llog);
+
+/**
+ * Parse a config record and write a dump of it into the supplied buffer.
+ * This is split out of class_config_dump_handler() so that it can also
+ * be used from the ioctl path.
+ */
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size)
+{
+ struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1);
+ char *ptr = buf;
+ char *end = buf + size;
+ int rc = 0;
+
+ LASSERT(rec->lrh_type == OBD_CFG_REC);
+ rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len);
+ if (rc < 0)
+ return rc;
+
+ ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command);
+ if (lcfg->lcfg_flags)
+ ptr += snprintf(ptr, end-ptr, "flags=%#08x ",
+ lcfg->lcfg_flags);
+
+ if (lcfg->lcfg_num)
+ ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num);
+
+ if (lcfg->lcfg_nid)
+ ptr += snprintf(ptr, end-ptr, "nid=%s(%#llx)\n ",
+ libcfs_nid2str(lcfg->lcfg_nid),
+ lcfg->lcfg_nid);
+
+ if (lcfg->lcfg_command == LCFG_MARKER) {
+ struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+ ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'",
+ marker->cm_step, marker->cm_flags,
+ marker->cm_tgtname, marker->cm_comment);
+ } else {
+ int i;
+
+ for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+ ptr += snprintf(ptr, end-ptr, "%d:%s ", i,
+ lustre_cfg_string(lcfg, i));
+ }
+ }
+ /* return consumed bytes */
+ rc = ptr - buf;
+ return rc;
+}
+
+int class_config_dump_handler(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data)
+{
+ char *outstr;
+ int rc = 0;
+
+ OBD_ALLOC(outstr, 256);
+ if (outstr == NULL)
+ return -ENOMEM;
+
+ if (rec->lrh_type == OBD_CFG_REC) {
+ class_config_parse_rec(rec, outstr, 256);
+ LCONSOLE(D_WARNING, " %s\n", outstr);
+ } else {
+ LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
+ rc = -EINVAL;
+ }
+
+ OBD_FREE(outstr, 256);
+ return rc;
+}
+
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+ char *name, struct config_llog_instance *cfg)
+{
+ struct llog_handle *llh;
+ int rc;
+
+ LCONSOLE_INFO("Dumping config log %s\n", name);
+
+ rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+ if (rc)
+ return rc;
+
+ rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ goto parse_out;
+
+ rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL);
+parse_out:
+ llog_close(env, llh);
+
+ LCONSOLE_INFO("End config log %s\n", name);
+ return rc;
+}
+EXPORT_SYMBOL(class_config_dump_llog);
+
+/** Call class_cleanup and class_detach.
+ * "Manual" only in the sense that we're faking lcfg commands.
+ */
+int class_manual_cleanup(struct obd_device *obd)
+{
+ char flags[3] = "";
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs bufs;
+ int rc;
+
+ if (!obd) {
+ CERROR("empty cleanup\n");
+ return -EALREADY;
+ }
+
+ if (obd->obd_force)
+ strcat(flags, "F");
+ if (obd->obd_fail)
+ strcat(flags, "A");
+
+ CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n",
+ obd->obd_name, flags);
+
+ lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, flags);
+ lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ if (!lcfg)
+ return -ENOMEM;
+
+ rc = class_process_config(lcfg);
+ if (rc) {
+ CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+ goto out;
+ }
+
+ /* the lcfg is almost the same for both ops */
+ lcfg->lcfg_command = LCFG_DETACH;
+ rc = class_process_config(lcfg);
+ if (rc)
+ CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
+ lustre_cfg_free(lcfg);
+ return rc;
+}
+EXPORT_SYMBOL(class_manual_cleanup);
+
+/*
+ * uuid<->export lustre hash operations
+ */
+
+static unsigned
+uuid_hash(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid,
+ sizeof(((struct obd_uuid *)key)->uuid), mask);
+}
+
+static void *
+uuid_key(struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+ return &exp->exp_client_uuid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ * state with this function
+ */
+static int
+uuid_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ LASSERT(key);
+ exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+ return obd_uuid_equals(key, &exp->exp_client_uuid) &&
+ !exp->exp_failed;
+}
+
+static void *
+uuid_export_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+}
+
+static void
+uuid_export_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+ class_export_get(exp);
+}
+
+static void
+uuid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+ class_export_put(exp);
+}
+
+static cfs_hash_ops_t uuid_hash_ops = {
+ .hs_hash = uuid_hash,
+ .hs_key = uuid_key,
+ .hs_keycmp = uuid_keycmp,
+ .hs_object = uuid_export_object,
+ .hs_get = uuid_export_get,
+ .hs_put_locked = uuid_export_put_locked,
+};
+
+
+/*
+ * nid<->export hash operations
+ */
+
+static unsigned
+nid_hash(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+static void *
+nid_key(struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+ return &exp->exp_connection->c_peer.nid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ * state with this function
+ */
+static int
+nid_kepcmp(const void *key, struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ LASSERT(key);
+ exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+ return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+ !exp->exp_failed;
+}
+
+static void *
+nid_export_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct obd_export, exp_nid_hash);
+}
+
+static void
+nid_export_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+ class_export_get(exp);
+}
+
+static void
+nid_export_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct obd_export *exp;
+
+ exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+ class_export_put(exp);
+}
+
+static cfs_hash_ops_t nid_hash_ops = {
+ .hs_hash = nid_hash,
+ .hs_key = nid_key,
+ .hs_keycmp = nid_kepcmp,
+ .hs_object = nid_export_object,
+ .hs_get = nid_export_get,
+ .hs_put_locked = nid_export_put_locked,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ */
+
+static void *
+nidstats_key(struct hlist_node *hnode)
+{
+ struct nid_stat *ns;
+
+ ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+ return &ns->nid;
+}
+
+static int
+nidstats_keycmp(const void *key, struct hlist_node *hnode)
+{
+ return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key;
+}
+
+static void *
+nidstats_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct nid_stat, nid_hash);
+}
+
+static void
+nidstats_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct nid_stat *ns;
+
+ ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+ nidstat_getref(ns);
+}
+
+static void
+nidstats_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct nid_stat *ns;
+
+ ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+ nidstat_putref(ns);
+}
+
+static cfs_hash_ops_t nid_stat_hash_ops = {
+ .hs_hash = nid_hash,
+ .hs_key = nidstats_key,
+ .hs_keycmp = nidstats_keycmp,
+ .hs_object = nidstats_object,
+ .hs_get = nidstats_get,
+ .hs_put_locked = nidstats_put_locked,
+};
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
new file mode 100644
index 000000000..3437b2ecf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
@@ -0,0 +1,1319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */)
+#define PRINT_CMD CDEBUG
+
+#include "../include/obd.h"
+#include "../include/linux/lustre_compat25.h"
+#include "../include/obd_class.h"
+#include "../include/lustre/lustre_user.h"
+#include "../include/lustre_log.h"
+#include "../include/lustre_disk.h"
+#include "../include/lustre_param.h"
+
+static int (*client_fill_super)(struct super_block *sb,
+ struct vfsmount *mnt);
+
+static void (*kill_super_cb)(struct super_block *sb);
+
+/**************** config llog ********************/
+
+/** Get a config log from the MGS and process it.
+ * This function is called for both clients and servers.
+ * Continue to process new statements appended to the logs
+ * (whenever the config lock is revoked) until lustre_end_log
+ * is called.
+ * @param sb The superblock is used by the MGC to write to the local copy of
+ * the config log
+ * @param logname The name of the llog to replicate from the MGS
+ * @param cfg Since the same mgc may be used to follow multiple config logs
+ * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
+ * this log, and is added to the mgc's list of logs to follow.
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg)
+{
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs *bufs;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *mgc = lsi->lsi_mgc;
+ int rc;
+
+ LASSERT(mgc);
+ LASSERT(cfg);
+
+ OBD_ALLOC_PTR(bufs);
+ if (bufs == NULL)
+ return -ENOMEM;
+
+ /* mgc_process_config */
+ lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+ lustre_cfg_bufs_set_string(bufs, 1, logname);
+ lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+ lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+ lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
+ rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+ lustre_cfg_free(lcfg);
+
+ OBD_FREE_PTR(bufs);
+
+ if (rc == -EINVAL)
+ LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' failed from the MGS (%d). Make sure this client and the MGS are running compatible versions of Lustre.\n",
+ mgc->obd_name, logname, rc);
+ else if (rc)
+ LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' failed (%d). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.\n",
+ mgc->obd_name, logname, rc);
+
+ /* class_obd_list(); */
+ return rc;
+}
+EXPORT_SYMBOL(lustre_process_log);
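+
+/*
+ * Illustrative sketch (not one of the call sites in this file): a client
+ * mount typically replicates its "<fsname>-client" log and stops
+ * following it again at unmount time, roughly:
+ *
+ *	struct config_llog_instance cfg = { 0 };
+ *
+ *	rc = lustre_process_log(sb, "lustre-client", &cfg);
+ *	...
+ *	rc = lustre_end_log(sb, "lustre-client", &cfg);
+ */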
+
+/* Stop watching this config log for updates */
+int lustre_end_log(struct super_block *sb, char *logname,
+ struct config_llog_instance *cfg)
+{
+ struct lustre_cfg *lcfg;
+ struct lustre_cfg_bufs bufs;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *mgc = lsi->lsi_mgc;
+ int rc;
+
+ if (!mgc)
+ return -ENOENT;
+
+ /* mgc_process_config */
+ lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+ lustre_cfg_bufs_set_string(&bufs, 1, logname);
+ if (cfg)
+ lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+ lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
+ rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+ lustre_cfg_free(lcfg);
+ return rc;
+}
+EXPORT_SYMBOL(lustre_end_log);
+
+/**************** obd start *******************/
+
+/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
+ * lctl (and do for echo cli/srv).
+ */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+ char *s1, char *s2, char *s3, char *s4)
+{
+ struct lustre_cfg_bufs bufs;
+ struct lustre_cfg *lcfg = NULL;
+ int rc;
+
+ CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+ cmd, s1, s2, s3, s4);
+
+ lustre_cfg_bufs_reset(&bufs, cfgname);
+ if (s1)
+ lustre_cfg_bufs_set_string(&bufs, 1, s1);
+ if (s2)
+ lustre_cfg_bufs_set_string(&bufs, 2, s2);
+ if (s3)
+ lustre_cfg_bufs_set_string(&bufs, 3, s3);
+ if (s4)
+ lustre_cfg_bufs_set_string(&bufs, 4, s4);
+
+ lcfg = lustre_cfg_new(cmd, &bufs);
+ lcfg->lcfg_nid = nid;
+ rc = class_process_config(lcfg);
+ lustre_cfg_free(lcfg);
+ return rc;
+}
+EXPORT_SYMBOL(do_lcfg);
+
+/** Call class_attach and class_setup. These methods in turn call
+ * obd type-specific methods.
+ */
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+ char *s1, char *s2, char *s3, char *s4)
+{
+ int rc;
+
+ CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+
+ rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL);
+ if (rc) {
+ CERROR("%s attach error %d\n", obdname, rc);
+ return rc;
+ }
+ rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
+ if (rc) {
+ CERROR("%s setup error %d\n", obdname, rc);
+ do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL);
+ }
+ return rc;
+}
+
+DEFINE_MUTEX(mgc_start_lock);
+
+/** Set up a mgc obd to process startup logs
+ *
+ * \param sb [in] super block of the mgc obd
+ *
+ * \retval 0 success, otherwise error code
+ */
+int lustre_start_mgc(struct super_block *sb)
+{
+ struct obd_connect_data *data = NULL;
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *obd;
+ struct obd_export *exp;
+ struct obd_uuid *uuid;
+ class_uuid_t uuidc;
+ lnet_nid_t nid;
+ char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+ char *ptr;
+ int rc = 0, i = 0, j, len;
+
+ LASSERT(lsi->lsi_lmd);
+
+ /* Find the first non-lo MGS nid for our MGC name */
+ if (IS_SERVER(lsi)) {
+ /* mount -o mgsnode=nid */
+ ptr = lsi->lsi_lmd->lmd_mgs;
+ if (lsi->lsi_lmd->lmd_mgs &&
+ (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
+ i++;
+ } else if (IS_MGS(lsi)) {
+ lnet_process_id_t id;
+ while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+ if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+ continue;
+ nid = id.nid;
+ i++;
+ break;
+ }
+ }
+ } else { /* client */
+ /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+ ptr = lsi->lsi_lmd->lmd_dev;
+ if (class_parse_nid(ptr, &nid, &ptr) == 0)
+ i++;
+ }
+ if (i == 0) {
+ CERROR("No valid MGS nids found.\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&mgc_start_lock);
+
+ len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
+ OBD_ALLOC(mgcname, len);
+ OBD_ALLOC(niduuid, len + 2);
+ if (!mgcname || !niduuid) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+ sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
+
+ mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
+
+ OBD_ALLOC_PTR(data);
+ if (data == NULL) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ obd = class_name2obd(mgcname);
+ if (obd && !obd->obd_stopping) {
+ int recov_bk;
+
+ rc = obd_set_info_async(NULL, obd->obd_self_export,
+ strlen(KEY_MGSSEC), KEY_MGSSEC,
+ strlen(mgssec), mgssec, NULL);
+ if (rc)
+ goto out_free;
+
+ /* Re-using an existing MGC */
+ atomic_inc(&obd->u.cli.cl_mgc_refcount);
+
+ /* IR compatibility check, only for clients */
+ if (lmd_is_client(lsi->lsi_lmd)) {
+ int has_ir;
+ int vallen = sizeof(*data);
+ __u32 *flags = &lsi->lsi_lmd->lmd_flags;
+
+ rc = obd_get_info(NULL, obd->obd_self_export,
+ strlen(KEY_CONN_DATA), KEY_CONN_DATA,
+ &vallen, data, NULL);
+ LASSERT(rc == 0);
+ has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
+ if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
+ /* LMD_FLG_NOIR is for test purpose only */
+ LCONSOLE_WARN(
+ "Trying to mount a client with IR setting not compatible with current mgc. Force to use current mgc setting that is IR %s.\n",
+ has_ir ? "enabled" : "disabled");
+ if (has_ir)
+ *flags &= ~LMD_FLG_NOIR;
+ else
+ *flags |= LMD_FLG_NOIR;
+ }
+ }
+
+ recov_bk = 0;
+ /* If we are restarting the MGS, don't try to keep the MGC's
+ old connection, or registration will fail. */
+ if (IS_MGS(lsi)) {
+ CDEBUG(D_MOUNT, "New MGS with live MGC\n");
+ recov_bk = 1;
+ }
+
+ /* Try all connections, but only once (again).
+ We don't want to block another target from starting
+ (using its local copy of the log), but we do want to connect
+ if at all possible. */
+ recov_bk++;
+ CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,
+ recov_bk);
+ rc = obd_set_info_async(NULL, obd->obd_self_export,
+ sizeof(KEY_INIT_RECOV_BACKUP),
+ KEY_INIT_RECOV_BACKUP,
+ sizeof(recov_bk), &recov_bk, NULL);
+ rc = 0;
+ goto out;
+ }
+
+ CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
+
+ /* Add the primary nids for the MGS */
+ i = 0;
+ sprintf(niduuid, "%s_%x", mgcname, i);
+ if (IS_SERVER(lsi)) {
+ ptr = lsi->lsi_lmd->lmd_mgs;
+ if (IS_MGS(lsi)) {
+ /* Use local nids (including LO) */
+ lnet_process_id_t id;
+ while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+ rc = do_lcfg(mgcname, id.nid,
+ LCFG_ADD_UUID, niduuid,
+ NULL, NULL, NULL);
+ }
+ } else {
+ /* Use mgsnode= nids */
+ /* mount -o mgsnode=nid */
+ if (lsi->lsi_lmd->lmd_mgs) {
+ ptr = lsi->lsi_lmd->lmd_mgs;
+ } else if (class_find_param(ptr, PARAM_MGSNODE,
+ &ptr) != 0) {
+ CERROR("No MGS nids given.\n");
+ rc = -EINVAL;
+ goto out_free;
+ }
+ while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+ rc = do_lcfg(mgcname, nid,
+ LCFG_ADD_UUID, niduuid,
+ NULL, NULL, NULL);
+ i++;
+ }
+ }
+ } else { /* client */
+ /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+ ptr = lsi->lsi_lmd->lmd_dev;
+ while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+ rc = do_lcfg(mgcname, nid,
+ LCFG_ADD_UUID, niduuid, NULL, NULL, NULL);
+ i++;
+ /* Stop at the first failover nid */
+ if (*ptr == ':')
+ break;
+ }
+ }
+ if (i == 0) {
+ CERROR("No valid MGS nids found.\n");
+ rc = -EINVAL;
+ goto out_free;
+ }
+ lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+ /* Random uuid for MGC allows easier reconnects */
+ OBD_ALLOC_PTR(uuid);
+ if (!uuid) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ ll_generate_random_uuid(uuidc);
+ class_uuid_unparse(uuidc, uuid);
+
+ /* Start the MGC */
+ rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
+ (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+ niduuid, NULL, NULL);
+ OBD_FREE_PTR(uuid);
+ if (rc)
+ goto out_free;
+
+ /* Add any failover MGS nids */
+ i = 1;
+ while (ptr && ((*ptr == ':' ||
+ class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
+ /* New failover node */
+ sprintf(niduuid, "%s_%x", mgcname, i);
+ j = 0;
+ while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+ j++;
+ rc = do_lcfg(mgcname, nid,
+ LCFG_ADD_UUID, niduuid, NULL, NULL, NULL);
+ if (*ptr == ':')
+ break;
+ }
+ if (j > 0) {
+ rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
+ niduuid, NULL, NULL, NULL);
+ i++;
+ } else {
+ /* at ":/fsname" */
+ break;
+ }
+ }
+ lsi->lsi_lmd->lmd_mgs_failnodes = i;
+
+ obd = class_name2obd(mgcname);
+ if (!obd) {
+ CERROR("Can't find mgcobd %s\n", mgcname);
+ rc = -ENOTCONN;
+ goto out_free;
+ }
+
+ rc = obd_set_info_async(NULL, obd->obd_self_export,
+ strlen(KEY_MGSSEC), KEY_MGSSEC,
+ strlen(mgssec), mgssec, NULL);
+ if (rc)
+ goto out_free;
+
+ /* Keep a refcount of servers/clients who started with "mount",
+ so we know when we can get rid of the mgc. */
+ atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+ /* We connect to the MGS at setup, and don't disconnect until cleanup */
+ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
+ OBD_CONNECT_LVB_TYPE;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+ data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+ if (lmd_is_client(lsi->lsi_lmd) &&
+ lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
+ data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
+ data->ocd_version = LUSTRE_VERSION_CODE;
+ rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
+ if (rc) {
+ CERROR("connect failed %d\n", rc);
+ goto out;
+ }
+
+ obd->u.cli.cl_mgc_mgsexp = exp;
+
+out:
+ /* Keep the mgc info in the sb. Note that many lsi's can point
+ to the same mgc. */
+ lsi->lsi_mgc = obd;
+out_free:
+ mutex_unlock(&mgc_start_lock);
+
+ if (data)
+ OBD_FREE_PTR(data);
+ if (mgcname)
+ OBD_FREE(mgcname, len);
+ if (niduuid)
+ OBD_FREE(niduuid, len + 2);
+ return rc;
+}
+
+static int lustre_stop_mgc(struct super_block *sb)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *obd;
+ char *niduuid = NULL, *ptr = NULL;
+ int i, rc = 0, len = 0;
+
+ if (!lsi)
+ return -ENOENT;
+ obd = lsi->lsi_mgc;
+ if (!obd)
+ return -ENOENT;
+ lsi->lsi_mgc = NULL;
+
+ mutex_lock(&mgc_start_lock);
+ LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
+ if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+ /* This is not fatal, every client that stops
+ will call in here. */
+ CDEBUG(D_MOUNT, "mgc still has %d references.\n",
+ atomic_read(&obd->u.cli.cl_mgc_refcount));
+ rc = -EBUSY;
+ goto out;
+ }
+
+ /* The MGC has no recoverable data in any case.
+ * force shutdown set in umount_begin */
+ obd->obd_no_recov = 1;
+
+ if (obd->u.cli.cl_mgc_mgsexp) {
+ /* An error is not fatal, if we are unable to send the
+ disconnect mgs ping evictor cleans up the export */
+ rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+ if (rc)
+ CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+ }
+
+ /* Save the obdname for cleaning the nid uuids, which are
+ obdname_XX */
+ len = strlen(obd->obd_name) + 6;
+ OBD_ALLOC(niduuid, len);
+ if (niduuid) {
+ strcpy(niduuid, obd->obd_name);
+ ptr = niduuid + strlen(niduuid);
+ }
+
+ rc = class_manual_cleanup(obd);
+ if (rc)
+ goto out;
+
+ /* Clean the nid uuids */
+ if (!niduuid) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+ sprintf(ptr, "_%x", i);
+ rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+ niduuid, NULL, NULL, NULL);
+ if (rc)
+ CERROR("del MDC UUID %s failed: rc = %d\n",
+ niduuid, rc);
+ }
+out:
+ if (niduuid)
+ OBD_FREE(niduuid, len);
+
+ /* class_import_put will get rid of the additional connections */
+ mutex_unlock(&mgc_start_lock);
+ return rc;
+}
+
+/***************** lustre superblock **************/
+
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+ struct lustre_sb_info *lsi;
+
+ OBD_ALLOC_PTR(lsi);
+ if (!lsi)
+ return NULL;
+ OBD_ALLOC_PTR(lsi->lsi_lmd);
+ if (!lsi->lsi_lmd) {
+ OBD_FREE_PTR(lsi);
+ return NULL;
+ }
+
+ lsi->lsi_lmd->lmd_exclude_count = 0;
+ lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+ lsi->lsi_lmd->lmd_recovery_time_hard = 0;
+ s2lsi_nocast(sb) = lsi;
+ /* we take 1 extra ref for our setup */
+ atomic_set(&lsi->lsi_mounts, 1);
+
+ /* Default umount style */
+ lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+
+ return lsi;
+}
+
+static int lustre_free_lsi(struct super_block *sb)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+
+ LASSERT(lsi != NULL);
+ CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+ /* someone didn't call server_put_mount. */
+ LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+ if (lsi->lsi_lmd != NULL) {
+ if (lsi->lsi_lmd->lmd_dev != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_dev,
+ strlen(lsi->lsi_lmd->lmd_dev) + 1);
+ if (lsi->lsi_lmd->lmd_profile != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_profile,
+ strlen(lsi->lsi_lmd->lmd_profile) + 1);
+ if (lsi->lsi_lmd->lmd_mgssec != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
+ strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
+ if (lsi->lsi_lmd->lmd_opts != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_opts,
+ strlen(lsi->lsi_lmd->lmd_opts) + 1);
+ if (lsi->lsi_lmd->lmd_exclude_count)
+ OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+ sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
+ lsi->lsi_lmd->lmd_exclude_count);
+ if (lsi->lsi_lmd->lmd_mgs != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_mgs,
+ strlen(lsi->lsi_lmd->lmd_mgs) + 1);
+ if (lsi->lsi_lmd->lmd_osd_type != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
+ strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
+ if (lsi->lsi_lmd->lmd_params != NULL)
+ OBD_FREE(lsi->lsi_lmd->lmd_params, 4096);
+
+ OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+ }
+
+ LASSERT(lsi->lsi_llsbi == NULL);
+ OBD_FREE(lsi, sizeof(*lsi));
+ s2lsi_nocast(sb) = NULL;
+
+ return 0;
+}
+
+/* The lsi has one reference for every server that is using the disk -
+ e.g. MDT, MGS, and potentially MGC */
+int lustre_put_lsi(struct super_block *sb)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+
+ LASSERT(lsi != NULL);
+
+ CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+ if (atomic_dec_and_test(&lsi->lsi_mounts)) {
+ if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+ lu_device_put(&lsi->lsi_dt_dev->dd_lu_dev);
+ lsi->lsi_osd_exp->exp_obd->obd_lvfs_ctxt.dt = NULL;
+ lsi->lsi_dt_dev = NULL;
+ obd_disconnect(lsi->lsi_osd_exp);
+ /* wait till OSD is gone */
+ obd_zombie_barrier();
+ }
+ lustre_free_lsi(sb);
+ return 1;
+ }
+ return 0;
+}
+
+/*** SERVER NAME ***
+ * <FSNAME><SEPARATOR><TYPE><INDEX>
+ * FSNAME is between 1 and 8 characters (inclusive).
+ * Excluded characters are '/' and ':'
+ * SEPARATOR is either ':' or '-'
+ * TYPE: "OST", "MDT", etc.
+ * INDEX: Hex representation of the index
+ */
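+/*
+ * For example, "lustre-OST003F" is OST index 0x003f of the filesystem
+ * "lustre": FSNAME "lustre", SEPARATOR '-', TYPE "OST", INDEX 003F.
+ */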
+
+/** Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy filesystem name prefix into.
+ * Must have at least 'strlen(fsname) + 1' chars.
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname
+ * rc < 0 on error
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+ const char *dash;
+
+ dash = svname + strnlen(svname, 8); /* max fsname length is 8 */
+ for (; dash > svname && *dash != '-' && *dash != ':'; dash--)
+ ;
+ if (dash == svname)
+ return -EINVAL;
+
+ if (fsname != NULL) {
+ strncpy(fsname, svname, dash - svname);
+ fsname[dash - svname] = '\0';
+ }
+
+ if (endptr != NULL)
+ *endptr = dash;
+
+ return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get the service name (svname) from a target label.
+ * If endptr isn't NULL it is set to the end of the fsname.
+ * @return 0 on success, negative value on error
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+ size_t svsize)
+{
+ int rc;
+ const char *dash;
+
+ /* We use server_name2fsname() just for parsing */
+ rc = server_name2fsname(label, NULL, &dash);
+ if (rc != 0)
+ return rc;
+
+ if (endptr != NULL)
+ *endptr = dash;
+
+ if (strlcpy(svname, dash + 1, svsize) >= svsize)
+ return -E2BIG;
+
+ return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+
+/*
+ * Get the index from the obd name.
+ * Returns the server type (LDD_F_SV_TYPE_*), possibly OR'd with
+ * LDD_F_SV_ALL, or a negative value on error.
+ * If endptr isn't NULL it is set to the end of the name.
+ */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+ unsigned long index;
+ int rc;
+ const char *dash;
+
+ /* We use server_name2fsname() just for parsing */
+ rc = server_name2fsname(svname, NULL, &dash);
+ if (rc != 0)
+ return rc;
+
+ dash++;
+
+ if (strncmp(dash, "MDT", 3) == 0)
+ rc = LDD_F_SV_TYPE_MDT;
+ else if (strncmp(dash, "OST", 3) == 0)
+ rc = LDD_F_SV_TYPE_OST;
+ else
+ return -EINVAL;
+
+ dash += 3;
+
+ if (strncmp(dash, "all", 3) == 0) {
+ if (endptr != NULL)
+ *endptr = dash + 3;
+ return rc | LDD_F_SV_ALL;
+ }
+
+ index = simple_strtoul(dash, (char **)endptr, 16);
+ if (idx != NULL)
+ *idx = index;
+
+ /* Account for -mdc after index that is possible when specifying mdt */
+ if (endptr != NULL && strncmp(LUSTRE_MDC_NAME, *endptr + 1,
+ sizeof(LUSTRE_MDC_NAME)-1) == 0)
+ *endptr += sizeof(LUSTRE_MDC_NAME);
+
+ return rc;
+}
+EXPORT_SYMBOL(server_name2index);
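+
+/*
+ * Worked example (illustrative): for the label "lustre-OST003F",
+ * server_name2fsname() yields fsname "lustre", server_name2svname()
+ * yields svname "OST003F", and server_name2index() returns
+ * LDD_F_SV_TYPE_OST with *idx set to 0x3f.
+ */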
+
+/*************** mount common between server and client ***************/
+
+/* Common umount */
+int lustre_common_put_super(struct super_block *sb)
+{
+ int rc;
+
+ CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+ /* Drop a ref to the MGC */
+ rc = lustre_stop_mgc(sb);
+ if (rc && (rc != -ENOENT)) {
+ if (rc != -EBUSY) {
+ CERROR("Can't stop MGC: %d\n", rc);
+ return rc;
+ }
+ /* BUSY just means that there's some other obd that
+ needs the mgc. Let him clean it up. */
+ CDEBUG(D_MOUNT, "MGC still in use\n");
+ }
+ /* Drop a ref to the mounted disk */
+ lustre_put_lsi(sb);
+ lu_types_stop();
+ return rc;
+}
+EXPORT_SYMBOL(lustre_common_put_super);
+
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+ int i;
+
+ PRINT_CMD(D_MOUNT, " mount data:\n");
+ if (lmd_is_client(lmd))
+ PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile);
+ PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev);
+ PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags);
+
+ if (lmd->lmd_opts)
+ PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts);
+
+ if (lmd->lmd_recovery_time_soft)
+ PRINT_CMD(D_MOUNT, "recovery time soft: %d\n",
+ lmd->lmd_recovery_time_soft);
+
+ if (lmd->lmd_recovery_time_hard)
+ PRINT_CMD(D_MOUNT, "recovery time hard: %d\n",
+ lmd->lmd_recovery_time_hard);
+
+ for (i = 0; i < lmd->lmd_exclude_count; i++) {
+ PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i,
+ lmd->lmd_exclude[i]);
+ }
+}
+
+/* Is this server on the exclusion list? */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct lustre_mount_data *lmd = lsi->lsi_lmd;
+ __u32 index;
+ int i, rc;
+
+ rc = server_name2index(svname, &index, NULL);
+ if (rc != LDD_F_SV_TYPE_OST)
+ /* Only exclude OSTs */
+ return 0;
+
+ CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+ index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+ for (i = 0; i < lmd->lmd_exclude_count; i++) {
+ if (index == lmd->lmd_exclude[i]) {
+ CWARN("Excluding %s (on exclusion list)\n", svname);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr)
+{
+ const char *s1 = ptr, *s2;
+ __u32 index, *exclude_list;
+ int rc = 0, devmax;
+
+ /* The shortest an ost name can be is 8 chars: -OST0000.
+ We don't actually know the fsname at this time, so in fact
+ a user could specify any fsname. */
+ devmax = strlen(ptr) / 8 + 1;
+
+ /* temp storage until we figure out how many we have */
+ OBD_ALLOC(exclude_list, sizeof(index) * devmax);
+ if (!exclude_list)
+ return -ENOMEM;
+
+ /* we enter this fn pointing at the '=' */
+ while (*s1 && *s1 != ' ' && *s1 != ',') {
+ s1++;
+ rc = server_name2index(s1, &index, &s2);
+ if (rc < 0) {
+ CERROR("Can't parse server name '%s': rc = %d\n",
+ s1, rc);
+ break;
+ }
+ if (rc == LDD_F_SV_TYPE_OST)
+ exclude_list[lmd->lmd_exclude_count++] = index;
+ else
+ CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n",
+ (uint)(s2-s1), s1, rc);
+ s1 = s2;
+ /* now we are pointing at ':' (next exclude)
+ or ',' (end of excludes) */
+ if (lmd->lmd_exclude_count >= devmax)
+ break;
+ }
+ if (rc >= 0) /* non-err */
+ rc = 0;
+
+ if (lmd->lmd_exclude_count) {
+ /* permanent, freed in lustre_free_lsi */
+ OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
+ lmd->lmd_exclude_count);
+ if (lmd->lmd_exclude) {
+ memcpy(lmd->lmd_exclude, exclude_list,
+ sizeof(index) * lmd->lmd_exclude_count);
+ } else {
+ rc = -ENOMEM;
+ lmd->lmd_exclude_count = 0;
+ }
+ }
+ OBD_FREE(exclude_list, sizeof(index) * devmax);
+ return rc;
+}
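+
+/*
+ * For example, "exclude=lustre-OST0001:lustre-OST0002" results in
+ * lmd_exclude_count == 2 and lmd_exclude[] == { 0x0001, 0x0002 };
+ * only OST indices are recorded, other target types are skipped.
+ */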
+
+static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
+{
+ char *tail;
+ int length;
+
+ if (lmd->lmd_mgssec != NULL) {
+ OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
+ lmd->lmd_mgssec = NULL;
+ }
+
+ tail = strchr(ptr, ',');
+ if (tail == NULL)
+ length = strlen(ptr);
+ else
+ length = tail - ptr;
+
+ OBD_ALLOC(lmd->lmd_mgssec, length + 1);
+ if (lmd->lmd_mgssec == NULL)
+ return -ENOMEM;
+
+ memcpy(lmd->lmd_mgssec, ptr, length);
+ lmd->lmd_mgssec[length] = '\0';
+ return 0;
+}
+
+static int lmd_parse_string(char **handle, char *ptr)
+{
+ char *tail;
+ int length;
+
+ if ((handle == NULL) || (ptr == NULL))
+ return -EINVAL;
+
+ if (*handle != NULL) {
+ OBD_FREE(*handle, strlen(*handle) + 1);
+ *handle = NULL;
+ }
+
+ tail = strchr(ptr, ',');
+ if (tail == NULL)
+ length = strlen(ptr);
+ else
+ length = tail - ptr;
+
+ OBD_ALLOC(*handle, length + 1);
+ if (*handle == NULL)
+ return -ENOMEM;
+
+ memcpy(*handle, ptr, length);
+ (*handle)[length] = '\0';
+
+ return 0;
+}
+
+/* Collect multiple values for mgsnid specifiers */
+static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
+{
+ lnet_nid_t nid;
+ char *tail = *ptr;
+ char *mgsnid;
+ int length;
+ int oldlen = 0;
+
+ /* Find end of nidlist */
+ while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {}
+ length = tail - *ptr;
+ if (length == 0) {
+ LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
+ return -EINVAL;
+ }
+
+ if (lmd->lmd_mgs != NULL)
+ oldlen = strlen(lmd->lmd_mgs) + 1;
+
+ OBD_ALLOC(mgsnid, oldlen + length + 1);
+ if (mgsnid == NULL)
+ return -ENOMEM;
+
+ if (lmd->lmd_mgs != NULL) {
+ /* Multiple mgsnid= are taken to mean failover locations */
+ memcpy(mgsnid, lmd->lmd_mgs, oldlen);
+ mgsnid[oldlen - 1] = ':';
+ OBD_FREE(lmd->lmd_mgs, oldlen);
+ }
+ memcpy(mgsnid + oldlen, *ptr, length);
+ mgsnid[oldlen + length] = '\0';
+ lmd->lmd_mgs = mgsnid;
+ *ptr = tail;
+
+ return 0;
+}
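+
+/*
+ * For example, calling this once per "mgsnode=" instance with
+ * "192.168.0.1@tcp" and then "192.168.0.2@tcp" accumulates
+ * lmd_mgs == "192.168.0.1@tcp:192.168.0.2@tcp", the ':' separating
+ * failover MGS locations.
+ */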
+
+/** Parse mount line options
+ * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
+ * dev is passed as device=uml1:/lustre by mount.lustre
+ */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+ char *s1, *s2, *devname = NULL;
+ struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+ int rc = 0;
+
+ LASSERT(lmd);
+ if (!options) {
+ LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that /sbin/mount.lustre is installed.\n");
+ return -EINVAL;
+ }
+
+ /* Options should be a string - try to detect old lmd data */
+ if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+ LCONSOLE_ERROR_MSG(0x163, "You're using an old version of /sbin/mount.lustre. Please install version %s\n",
+ LUSTRE_VERSION_STRING);
+ return -EINVAL;
+ }
+ lmd->lmd_magic = LMD_MAGIC;
+
+ OBD_ALLOC(lmd->lmd_params, 4096);
+ if (lmd->lmd_params == NULL)
+ return -ENOMEM;
+ lmd->lmd_params[0] = '\0';
+
+ /* Set default flags here */
+
+ s1 = options;
+ while (*s1) {
+ int clear = 0;
+ int time_min = OBD_RECOVERY_TIME_MIN;
+
+ /* Skip whitespace and extra commas */
+ while (*s1 == ' ' || *s1 == ',')
+ s1++;
+
+ /* Client options are parsed in ll_options: eg. flock,
+ user_xattr, acl */
+
+ /* Parse non-ldiskfs options here. Rather than modifying
+ ldiskfs, we just zero these out here */
+ if (strncmp(s1, "abort_recov", 11) == 0) {
+ lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+ clear++;
+ } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+ lmd->lmd_recovery_time_soft = max_t(int,
+ simple_strtoul(s1 + 19, NULL, 10), time_min);
+ clear++;
+ } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+ lmd->lmd_recovery_time_hard = max_t(int,
+ simple_strtoul(s1 + 19, NULL, 10), time_min);
+ clear++;
+ } else if (strncmp(s1, "noir", 4) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+ clear++;
+ } else if (strncmp(s1, "nosvc", 5) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOSVC;
+ clear++;
+ } else if (strncmp(s1, "nomgs", 5) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOMGS;
+ clear++;
+ } else if (strncmp(s1, "noscrub", 7) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+ clear++;
+ } else if (strncmp(s1, PARAM_MGSNODE,
+ sizeof(PARAM_MGSNODE) - 1) == 0) {
+ s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+ /* Assume the next mount opt is the first
+ invalid nid we get to. */
+ rc = lmd_parse_mgs(lmd, &s2);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "writeconf", 9) == 0) {
+ lmd->lmd_flags |= LMD_FLG_WRITECONF;
+ clear++;
+ } else if (strncmp(s1, "update", 6) == 0) {
+ lmd->lmd_flags |= LMD_FLG_UPDATE;
+ clear++;
+ } else if (strncmp(s1, "virgin", 6) == 0) {
+ lmd->lmd_flags |= LMD_FLG_VIRGIN;
+ clear++;
+ } else if (strncmp(s1, "noprimnode", 10) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+ clear++;
+ } else if (strncmp(s1, "mgssec=", 7) == 0) {
+ rc = lmd_parse_mgssec(lmd, s1 + 7);
+ if (rc)
+ goto invalid;
+ clear++;
+ /* ost exclusion list */
+ } else if (strncmp(s1, "exclude=", 8) == 0) {
+ rc = lmd_make_exclusion(lmd, s1 + 7);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "mgs", 3) == 0) {
+ /* We are an MGS */
+ lmd->lmd_flags |= LMD_FLG_MGS;
+ clear++;
+ } else if (strncmp(s1, "svname=", 7) == 0) {
+ rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "param=", 6) == 0) {
+ int length;
+ char *tail = strchr(s1 + 6, ',');
+ if (tail == NULL)
+ length = strlen(s1);
+ else
+ length = tail - s1;
+ length -= 6;
+ strncat(lmd->lmd_params, s1 + 6, length);
+ strcat(lmd->lmd_params, " ");
+ clear++;
+ } else if (strncmp(s1, "osd=", 4) == 0) {
+ rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "device=", 7) == 0) {
+ /* Linux 2.4 doesn't pass the device, so we stuck it at the
+ end of the options. */
+ devname = s1 + 7;
+ /* terminate options right before device. device
+ must be the last one. */
+ *s1 = '\0';
+ break;
+ }
+
+ /* Find next opt */
+ s2 = strchr(s1, ',');
+ if (s2 == NULL) {
+ if (clear)
+ *s1 = '\0';
+ break;
+ }
+ s2++;
+ if (clear)
+ memmove(s1, s2, strlen(s2) + 1);
+ else
+ s1 = s2;
+ }
+
+ if (!devname) {
+ LCONSOLE_ERROR_MSG(0x164, "Can't find the device name (need mount option 'device=...')\n");
+ goto invalid;
+ }
+
+ s1 = strstr(devname, ":/");
+ if (s1) {
+ ++s1;
+ lmd->lmd_flags |= LMD_FLG_CLIENT;
+ /* Remove leading /s from fsname */
+ while (*++s1 == '/')
+ ;
+ /* Freed in lustre_free_lsi */
+ OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+ if (!lmd->lmd_profile)
+ return -ENOMEM;
+ sprintf(lmd->lmd_profile, "%s-client", s1);
+ }
+
+ /* Freed in lustre_free_lsi */
+ OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+ if (!lmd->lmd_dev)
+ return -ENOMEM;
+ strcpy(lmd->lmd_dev, devname);
+
+ /* Save mount options */
+ s1 = options + strlen(options) - 1;
+ while (s1 >= options && (*s1 == ',' || *s1 == ' '))
+ *s1-- = 0;
+ if (*options != 0) {
+ /* Freed in lustre_free_lsi */
+ OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+ if (!lmd->lmd_opts)
+ return -ENOMEM;
+ strcpy(lmd->lmd_opts, options);
+ }
+
+ lmd_print(lmd);
+ lmd->lmd_magic = LMD_MAGIC;
+
+ return rc;
+
+invalid:
+ CERROR("Bad mount options %s\n", options);
+ return -EINVAL;
+}
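+
+/*
+ * Worked example (illustrative): for the option string
+ * "abort_recov,flock,device=192.168.0.1@tcp:/lustre", lmd_parse() sets
+ * LMD_FLG_ABORT_RECOV and LMD_FLG_CLIENT, copies the device string into
+ * lmd_dev, builds lmd_profile "lustre-client", and leaves the untouched
+ * client option "flock" in lmd_opts for ll_options() to handle.
+ */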
+
+struct lustre_mount_data2 {
+ void *lmd2_data;
+ struct vfsmount *lmd2_mnt;
+};
+
+/** This is the entry point for the mount call into Lustre.
+ * This is called when a server or client is mounted,
+ * and this is where we start setting things up.
+ * @param data Mount options (e.g. -o flock,abort_recov)
+ */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct lustre_mount_data *lmd;
+ struct lustre_mount_data2 *lmd2 = data;
+ struct lustre_sb_info *lsi;
+ int rc;
+
+ CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+ lsi = lustre_init_lsi(sb);
+ if (!lsi)
+ return -ENOMEM;
+ lmd = lsi->lsi_lmd;
+
+ /*
+ * Disable lockdep during mount, because mount locking patterns are
+ * `special'.
+ */
+ lockdep_off();
+
+ /*
+ * LU-639: the obd cleanup of last mount may not finish yet, wait here.
+ */
+ obd_zombie_barrier();
+
+ /* Figure out the lmd from the mount options */
+ if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
+ lustre_put_lsi(sb);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (lmd_is_client(lmd)) {
+ CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+ if (client_fill_super == NULL)
+ request_module("lustre");
+ if (client_fill_super == NULL) {
+ LCONSOLE_ERROR_MSG(0x165, "Nothing registered for client mount! Is the 'lustre' module loaded?\n");
+ lustre_put_lsi(sb);
+ rc = -ENODEV;
+ } else {
+ rc = lustre_start_mgc(sb);
+ if (rc) {
+ lustre_put_lsi(sb);
+ goto out;
+ }
+ /* Connect and start */
+ /* (should always be ll_fill_super) */
+ rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
+ /* c_f_s will call lustre_common_put_super on failure */
+ }
+ } else {
+ CERROR("This is client-side-only module, cannot handle server mount.\n");
+ rc = -EINVAL;
+ }
+
+ /* If an error happens in the fill_super() call, @lsi will be killed
+ * there. This is why we do not put it here. */
+ goto out;
+out:
+ if (rc) {
+ CERROR("Unable to mount %s (%d)\n",
+ s2lsi(sb) ? lmd->lmd_dev : "", rc);
+ } else {
+ CDEBUG(D_SUPER, "Mount %s complete\n",
+ lmd->lmd_dev);
+ }
+ lockdep_on();
+ return rc;
+}
+
+
+/* We can't call ll_fill_super by name because it lives in a module that
+ must be loaded after this one. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+ struct vfsmount *mnt))
+{
+ client_fill_super = cfs;
+}
+EXPORT_SYMBOL(lustre_register_client_fill_super);
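+
+/*
+ * The 'lustre' client module registers ll_fill_super() through this hook
+ * when it loads; that is what the request_module("lustre") call in
+ * lustre_fill_super() above relies on.
+ */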
+
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
+{
+ kill_super_cb = cfs;
+}
+EXPORT_SYMBOL(lustre_register_kill_super_cb);
+
+/***************** FS registration ******************/
+struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+ const char *devname, void *data)
+{
+ struct lustre_mount_data2 lmd2 = {
+ .lmd2_data = data,
+ .lmd2_mnt = NULL
+ };
+
+ return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
+}
+
+static void lustre_kill_super(struct super_block *sb)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+
+ if (kill_super_cb && lsi && !IS_SERVER(lsi))
+ (*kill_super_cb)(sb);
+
+ kill_anon_super(sb);
+}
+
+/** Register the "lustre" fs type
+ */
+struct file_system_type lustre_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "lustre",
+ .mount = lustre_mount,
+ .kill_sb = lustre_kill_super,
+ .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
+ FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE,
+};
+MODULE_ALIAS_FS("lustre");
+
+int lustre_register_fs(void)
+{
+ return register_filesystem(&lustre_fs_type);
+}
+
+int lustre_unregister_fs(void)
+{
+ return unregister_filesystem(&lustre_fs_type);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c
new file mode 100644
index 000000000..307ffe347
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obdo.c
@@ -0,0 +1,362 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../include/obd_class.h"
+#include "../include/lustre/lustre_idl.h"
+
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent)
+{
+ dst->o_parent_oid = fid_oid(parent);
+ dst->o_parent_seq = fid_seq(parent);
+ dst->o_parent_ver = fid_ver(parent);
+ dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID;
+}
+EXPORT_SYMBOL(obdo_set_parent_fid);
+
+/* WARNING: the file systems must take care not to tinker with
+ attributes they don't manage (such as blocks). */
+void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid)
+{
+ u32 newvalid = 0;
+
+ if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+ CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
+ valid, LTIME_S(src->i_mtime),
+ LTIME_S(src->i_ctime));
+
+ if (valid & OBD_MD_FLATIME) {
+ dst->o_atime = LTIME_S(src->i_atime);
+ newvalid |= OBD_MD_FLATIME;
+ }
+ if (valid & OBD_MD_FLMTIME) {
+ dst->o_mtime = LTIME_S(src->i_mtime);
+ newvalid |= OBD_MD_FLMTIME;
+ }
+ if (valid & OBD_MD_FLCTIME) {
+ dst->o_ctime = LTIME_S(src->i_ctime);
+ newvalid |= OBD_MD_FLCTIME;
+ }
+ if (valid & OBD_MD_FLSIZE) {
+ dst->o_size = i_size_read(src);
+ newvalid |= OBD_MD_FLSIZE;
+ }
+ if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */
+ dst->o_blocks = src->i_blocks;
+ newvalid |= OBD_MD_FLBLOCKS;
+ }
+ if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */
+ dst->o_blksize = 1 << src->i_blkbits;
+ newvalid |= OBD_MD_FLBLKSZ;
+ }
+ if (valid & OBD_MD_FLTYPE) {
+ dst->o_mode = (dst->o_mode & S_IALLUGO) |
+ (src->i_mode & S_IFMT);
+ newvalid |= OBD_MD_FLTYPE;
+ }
+ if (valid & OBD_MD_FLMODE) {
+ dst->o_mode = (dst->o_mode & S_IFMT) |
+ (src->i_mode & S_IALLUGO);
+ newvalid |= OBD_MD_FLMODE;
+ }
+ if (valid & OBD_MD_FLUID) {
+ dst->o_uid = from_kuid(&init_user_ns, src->i_uid);
+ newvalid |= OBD_MD_FLUID;
+ }
+ if (valid & OBD_MD_FLGID) {
+ dst->o_gid = from_kgid(&init_user_ns, src->i_gid);
+ newvalid |= OBD_MD_FLGID;
+ }
+ if (valid & OBD_MD_FLFLAGS) {
+ dst->o_flags = src->i_flags;
+ newvalid |= OBD_MD_FLFLAGS;
+ }
+ dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_inode);
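+
+/*
+ * Typical use (illustrative): pack just the timestamps and size of an
+ * inode into an obdo, e.g.
+ *
+ *	obdo_from_inode(oa, inode, OBD_MD_FLATIME | OBD_MD_FLMTIME |
+ *				   OBD_MD_FLCTIME | OBD_MD_FLSIZE);
+ *
+ * Only the requested fields are copied and o_valid is updated to match.
+ */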
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, u32 valid)
+{
+ CDEBUG(D_INODE, "src obdo "DOSTID" valid %#llx, dst obdo "DOSTID"\n",
+ POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi));
+ if (valid & OBD_MD_FLATIME)
+ dst->o_atime = src->o_atime;
+ if (valid & OBD_MD_FLMTIME)
+ dst->o_mtime = src->o_mtime;
+ if (valid & OBD_MD_FLCTIME)
+ dst->o_ctime = src->o_ctime;
+ if (valid & OBD_MD_FLSIZE)
+ dst->o_size = src->o_size;
+ if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+ dst->o_blocks = src->o_blocks;
+ if (valid & OBD_MD_FLBLKSZ)
+ dst->o_blksize = src->o_blksize;
+ if (valid & OBD_MD_FLTYPE)
+ dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT);
+ if (valid & OBD_MD_FLMODE)
+ dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+ if (valid & OBD_MD_FLUID)
+ dst->o_uid = src->o_uid;
+ if (valid & OBD_MD_FLGID)
+ dst->o_gid = src->o_gid;
+ if (valid & OBD_MD_FLFLAGS)
+ dst->o_flags = src->o_flags;
+ if (valid & OBD_MD_FLFID) {
+ dst->o_parent_seq = src->o_parent_seq;
+ dst->o_parent_ver = src->o_parent_ver;
+ }
+ if (valid & OBD_MD_FLGENER)
+ dst->o_parent_oid = src->o_parent_oid;
+ if (valid & OBD_MD_FLHANDLE)
+ dst->o_handle = src->o_handle;
+ if (valid & OBD_MD_FLCOOKIE)
+ dst->o_lcookie = src->o_lcookie;
+
+ dst->o_valid |= valid;
+}
+EXPORT_SYMBOL(obdo_cpy_md);
+
+/* Returns 0 if all fields selected by @compare match, non-zero if any differ. */
+int obdo_cmp_md(struct obdo *dst, struct obdo *src, u32 compare)
+{
+ int res = 0;
+
+ if (compare & OBD_MD_FLATIME)
+ res |= dst->o_atime != src->o_atime;
+ if (compare & OBD_MD_FLMTIME)
+ res |= dst->o_mtime != src->o_mtime;
+ if (compare & OBD_MD_FLCTIME)
+ res |= dst->o_ctime != src->o_ctime;
+ if (compare & OBD_MD_FLSIZE)
+ res |= dst->o_size != src->o_size;
+ if (compare & OBD_MD_FLBLOCKS) /* allocation of space */
+ res |= dst->o_blocks != src->o_blocks;
+ if (compare & OBD_MD_FLBLKSZ)
+ res |= dst->o_blksize != src->o_blksize;
+ if (compare & OBD_MD_FLTYPE)
+ res |= ((dst->o_mode ^ src->o_mode) & S_IFMT) != 0;
+ if (compare & OBD_MD_FLMODE)
+ res |= ((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0;
+ if (compare & OBD_MD_FLUID)
+ res |= dst->o_uid != src->o_uid;
+ if (compare & OBD_MD_FLGID)
+ res |= dst->o_gid != src->o_gid;
+ if (compare & OBD_MD_FLFLAGS)
+ res |= dst->o_flags != src->o_flags;
+ if (compare & OBD_MD_FLNLINK)
+ res |= dst->o_nlink != src->o_nlink;
+ if (compare & OBD_MD_FLFID) {
+ res |= dst->o_parent_seq != src->o_parent_seq;
+ res |= dst->o_parent_ver != src->o_parent_ver;
+ }
+ if (compare & OBD_MD_FLGENER)
+ res |= dst->o_parent_oid != src->o_parent_oid;
+ /* XXX Don't know if these should be included here - wasn't previously
+ if ( compare & OBD_MD_FLINLINE )
+ res |= memcmp(dst->o_inline, src->o_inline);
+ */
+ return res;
+}
+EXPORT_SYMBOL(obdo_cmp_md);
+
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj)
+{
+ ioobj->ioo_oid = oa->o_oi;
+ if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP)))
+ ostid_set_seq_mdt0(&ioobj->ioo_oid);
+
+ /* Since 2.4 this does not contain o_mode in the low 16 bits.
+ * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */
+ ioobj->ioo_max_brw = 0;
+}
+EXPORT_SYMBOL(obdo_to_ioobj);
+
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
+{
+ if (ia_valid & ATTR_ATIME) {
+ oa->o_atime = LTIME_S(attr->ia_atime);
+ oa->o_valid |= OBD_MD_FLATIME;
+ }
+ if (ia_valid & ATTR_MTIME) {
+ oa->o_mtime = LTIME_S(attr->ia_mtime);
+ oa->o_valid |= OBD_MD_FLMTIME;
+ }
+ if (ia_valid & ATTR_CTIME) {
+ oa->o_ctime = LTIME_S(attr->ia_ctime);
+ oa->o_valid |= OBD_MD_FLCTIME;
+ }
+ if (ia_valid & ATTR_SIZE) {
+ oa->o_size = attr->ia_size;
+ oa->o_valid |= OBD_MD_FLSIZE;
+ }
+ if (ia_valid & ATTR_MODE) {
+ oa->o_mode = attr->ia_mode;
+ oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
+ if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) &&
+ !capable(CFS_CAP_FSETID))
+ oa->o_mode &= ~S_ISGID;
+ }
+ if (ia_valid & ATTR_UID) {
+ oa->o_uid = from_kuid(&init_user_ns, attr->ia_uid);
+ oa->o_valid |= OBD_MD_FLUID;
+ }
+ if (ia_valid & ATTR_GID) {
+ oa->o_gid = from_kgid(&init_user_ns, attr->ia_gid);
+ oa->o_valid |= OBD_MD_FLGID;
+ }
+}
+EXPORT_SYMBOL(obdo_from_iattr);
+
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, u32 valid)
+{
+ valid &= oa->o_valid;
+
+ if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+ CDEBUG(D_INODE, "valid %#llx, new time %llu/%llu\n",
+ oa->o_valid, oa->o_mtime, oa->o_ctime);
+
+ attr->ia_valid = 0;
+ if (valid & OBD_MD_FLATIME) {
+ LTIME_S(attr->ia_atime) = oa->o_atime;
+ attr->ia_valid |= ATTR_ATIME;
+ }
+ if (valid & OBD_MD_FLMTIME) {
+ LTIME_S(attr->ia_mtime) = oa->o_mtime;
+ attr->ia_valid |= ATTR_MTIME;
+ }
+ if (valid & OBD_MD_FLCTIME) {
+ LTIME_S(attr->ia_ctime) = oa->o_ctime;
+ attr->ia_valid |= ATTR_CTIME;
+ }
+ if (valid & OBD_MD_FLSIZE) {
+ attr->ia_size = oa->o_size;
+ attr->ia_valid |= ATTR_SIZE;
+ }
+#if 0 /* you shouldn't be able to change a file's type with setattr */
+ if (valid & OBD_MD_FLTYPE) {
+ attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT);
+ attr->ia_valid |= ATTR_MODE;
+ }
+#endif
+ if (valid & OBD_MD_FLMODE) {
+ attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT);
+ attr->ia_valid |= ATTR_MODE;
+ if (!in_group_p(make_kgid(&init_user_ns, oa->o_gid)) &&
+ !capable(CFS_CAP_FSETID))
+ attr->ia_mode &= ~S_ISGID;
+ }
+ if (valid & OBD_MD_FLUID) {
+ attr->ia_uid = make_kuid(&init_user_ns, oa->o_uid);
+ attr->ia_valid |= ATTR_UID;
+ }
+ if (valid & OBD_MD_FLGID) {
+ attr->ia_gid = make_kgid(&init_user_ns, oa->o_gid);
+ attr->ia_valid |= ATTR_GID;
+ }
+}
+EXPORT_SYMBOL(iattr_from_obdo);
+
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, u32 valid)
+{
+ iattr_from_obdo(&op_data->op_attr, oa, valid);
+ if (valid & OBD_MD_FLBLOCKS) {
+ op_data->op_attr_blocks = oa->o_blocks;
+ op_data->op_attr.ia_valid |= ATTR_BLOCKS;
+ }
+ if (valid & OBD_MD_FLFLAGS) {
+ ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+ oa->o_flags;
+ op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+ }
+}
+EXPORT_SYMBOL(md_from_obdo);
+
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+ unsigned int valid)
+{
+ obdo_from_iattr(oa, &op_data->op_attr, valid);
+ if (valid & ATTR_BLOCKS) {
+ oa->o_blocks = op_data->op_attr_blocks;
+ oa->o_valid |= OBD_MD_FLBLOCKS;
+ }
+ if (valid & ATTR_ATTR_FLAG) {
+ oa->o_flags =
+ ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+ oa->o_valid |= OBD_MD_FLFLAGS;
+ }
+}
+EXPORT_SYMBOL(obdo_from_md);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo)
+{
+ dobdo->o_size = cpu_to_le64(sobdo->o_size);
+ dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime);
+ dobdo->o_atime = cpu_to_le64(sobdo->o_atime);
+ dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime);
+ dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks);
+ dobdo->o_mode = cpu_to_le32(sobdo->o_mode);
+ dobdo->o_uid = cpu_to_le32(sobdo->o_uid);
+ dobdo->o_gid = cpu_to_le32(sobdo->o_gid);
+ dobdo->o_flags = cpu_to_le32(sobdo->o_flags);
+ dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink);
+ dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize);
+ dobdo->o_valid = cpu_to_le64(sobdo->o_valid);
+}
+EXPORT_SYMBOL(obdo_cpu_to_le);
+
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo)
+{
+ dobdo->o_size = le64_to_cpu(sobdo->o_size);
+ dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime);
+ dobdo->o_atime = le64_to_cpu(sobdo->o_atime);
+ dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime);
+ dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks);
+ dobdo->o_mode = le32_to_cpu(sobdo->o_mode);
+ dobdo->o_uid = le32_to_cpu(sobdo->o_uid);
+ dobdo->o_gid = le32_to_cpu(sobdo->o_gid);
+ dobdo->o_flags = le32_to_cpu(sobdo->o_flags);
+ dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink);
+ dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize);
+ dobdo->o_valid = le64_to_cpu(sobdo->o_valid);
+}
+EXPORT_SYMBOL(obdo_le_to_cpu);
diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
new file mode 100644
index 000000000..cc785ab3f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/statfs.h>
+#include "../include/lustre_export.h"
+#include "../include/lustre_net.h"
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs)
+{
+ memset(osfs, 0, sizeof(*osfs));
+ osfs->os_type = sfs->f_type;
+ osfs->os_blocks = sfs->f_blocks;
+ osfs->os_bfree = sfs->f_bfree;
+ osfs->os_bavail = sfs->f_bavail;
+ osfs->os_files = sfs->f_files;
+ osfs->os_ffree = sfs->f_ffree;
+ osfs->os_bsize = sfs->f_bsize;
+ osfs->os_namelen = sfs->f_namelen;
+}
+EXPORT_SYMBOL(statfs_pack);
+
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs)
+{
+ memset(sfs, 0, sizeof(*sfs));
+ sfs->f_type = osfs->os_type;
+ sfs->f_blocks = osfs->os_blocks;
+ sfs->f_bfree = osfs->os_bfree;
+ sfs->f_bavail = osfs->os_bavail;
+ sfs->f_files = osfs->os_files;
+ sfs->f_ffree = osfs->os_ffree;
+ sfs->f_bsize = osfs->os_bsize;
+ sfs->f_namelen = osfs->os_namelen;
+}
+EXPORT_SYMBOL(statfs_unpack);
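+
+/*
+ * Illustrative sketch: a kstatfs obtained from the local filesystem
+ * (e.g. via vfs_statfs()) is converted to the wire form with
+ * statfs_pack(&osfs, &sfs); the peer recovers a kstatfs again with
+ * statfs_unpack(&sfs, &osfs).
+ */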
diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c
new file mode 100644
index 000000000..ff0a01bcf
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/uuid.c
@@ -0,0 +1,82 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
+ *
+ * UUID helper routines for the OBD class driver
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+
+
+static inline __u32 consume(int nob, __u8 **ptr)
+{
+ __u32 value;
+
+ LASSERT(nob <= sizeof(value));
+
+ for (value = 0; nob > 0; --nob)
+ value = (value << 8) | *((*ptr)++);
+ return value;
+}
+
+#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr))
+
+static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr)
+{
+ __u8 *ptr = in;
+
+ LASSERT(nr * sizeof(*uu) == sizeof(class_uuid_t));
+
+ while (nr-- > 0)
+ CONSUME(uu[nr], &ptr);
+}
+
+void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
+{
+ /* uu as an array of __u16's */
+ __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)];
+
+ CLASSERT(ARRAY_SIZE(uuid) == 8);
+
+ uuid_unpack(uu, uuid, ARRAY_SIZE(uuid));
+ sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x",
+ uuid[0], uuid[1], uuid[2], uuid[3],
+ uuid[4], uuid[5], uuid[6], uuid[7]);
+}
+EXPORT_SYMBOL(class_uuid_unparse);
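+
+/*
+ * The output follows the usual 8-4-4-4-12 hex layout, e.g.
+ * "0123abcd-4567-89ef-0123-456789abcdef". lustre_start_mgc() pairs this
+ * with ll_generate_random_uuid() to give each MGC a random uuid.
+ */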
diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile
new file mode 100644
index 000000000..672028fc7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_LUSTRE_FS) += obdecho.o
+obdecho-y := echo_client.o lproc_echo.o
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c
new file mode 100644
index 000000000..d542e06d6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c
@@ -0,0 +1,2197 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd.h"
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_debug.h"
+#include "../include/lprocfs_status.h"
+#include "../include/cl_object.h"
+#include "../include/lustre_fid.h"
+#include "../include/lustre_acl.h"
+#include "../include/lustre_net.h"
+
+#include "echo_internal.h"
+
+/** \defgroup echo_client Echo Client
+ * @{
+ */
+
+struct echo_device {
+ struct cl_device ed_cl;
+ struct echo_client_obd *ed_ec;
+
+ struct cl_site ed_site_myself;
+ struct cl_site *ed_site;
+ struct lu_device *ed_next;
+ int ed_next_islov;
+};
+
+struct echo_object {
+ struct cl_object eo_cl;
+ struct cl_object_header eo_hdr;
+
+ struct echo_device *eo_dev;
+ struct list_head eo_obj_chain;
+ struct lov_stripe_md *eo_lsm;
+ atomic_t eo_npages;
+ int eo_deleted;
+};
+
+struct echo_object_conf {
+ struct cl_object_conf eoc_cl;
+ struct lov_stripe_md **eoc_md;
+};
+
+struct echo_page {
+ struct cl_page_slice ep_cl;
+ struct mutex ep_lock;
+ struct page *ep_vmpage;
+};
+
+struct echo_lock {
+ struct cl_lock_slice el_cl;
+ struct list_head el_chain;
+ struct echo_object *el_object;
+ __u64 el_cookie;
+ atomic_t el_refcount;
+};
+
+static int echo_client_setup(const struct lu_env *env,
+ struct obd_device *obddev,
+ struct lustre_cfg *lcfg);
+static int echo_client_cleanup(struct obd_device *obddev);
+
+
+/** \defgroup echo_helpers Helper functions
+ * @{
+ */
+static inline struct echo_device *cl2echo_dev(const struct cl_device *dev)
+{
+ return container_of0(dev, struct echo_device, ed_cl);
+}
+
+static inline struct cl_device *echo_dev2cl(struct echo_device *d)
+{
+ return &d->ed_cl;
+}
+
+static inline struct echo_device *obd2echo_dev(const struct obd_device *obd)
+{
+ return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev));
+}
+
+static inline struct cl_object *echo_obj2cl(struct echo_object *eco)
+{
+ return &eco->eo_cl;
+}
+
+static inline struct echo_object *cl2echo_obj(const struct cl_object *o)
+{
+ return container_of(o, struct echo_object, eo_cl);
+}
+
+static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s)
+{
+ return container_of(s, struct echo_page, ep_cl);
+}
+
+static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s)
+{
+ return container_of(s, struct echo_lock, el_cl);
+}
+
+static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl)
+{
+ return ecl->el_cl.cls_lock;
+}
+
+static struct lu_context_key echo_thread_key;
+static inline struct echo_thread_info *echo_env_info(const struct lu_env *env)
+{
+ struct echo_thread_info *info;
+
+ info = lu_context_key_get(&env->le_ctx, &echo_thread_key);
+ LASSERT(info != NULL);
+ return info;
+}
+
+static inline
+struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c)
+{
+ return container_of(c, struct echo_object_conf, eoc_cl);
+}
+
+/** @} echo_helpers */
+
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+ struct lov_stripe_md **lsm);
+static int cl_echo_object_put(struct echo_object *eco);
+static int cl_echo_enqueue(struct echo_object *eco, u64 start,
+ u64 end, int mode, __u64 *cookie);
+static int cl_echo_cancel(struct echo_device *d, __u64 cookie);
+static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
+ struct page **pages, int npages, int async);
+
+struct echo_thread_info {
+ struct echo_object_conf eti_conf;
+ struct lustre_md eti_md;
+
+ struct cl_2queue eti_queue;
+ struct cl_io eti_io;
+ struct cl_lock_descr eti_descr;
+ struct lu_fid eti_fid;
+ struct lu_fid eti_fid2;
+};
+
+/* No session used right now */
+struct echo_session_info {
+ unsigned long dummy;
+};
+
+static struct kmem_cache *echo_lock_kmem;
+static struct kmem_cache *echo_object_kmem;
+static struct kmem_cache *echo_thread_kmem;
+static struct kmem_cache *echo_session_kmem;
+
+static struct lu_kmem_descr echo_caches[] = {
+ {
+ .ckd_cache = &echo_lock_kmem,
+ .ckd_name = "echo_lock_kmem",
+ .ckd_size = sizeof(struct echo_lock)
+ },
+ {
+ .ckd_cache = &echo_object_kmem,
+ .ckd_name = "echo_object_kmem",
+ .ckd_size = sizeof(struct echo_object)
+ },
+ {
+ .ckd_cache = &echo_thread_kmem,
+ .ckd_name = "echo_thread_kmem",
+ .ckd_size = sizeof(struct echo_thread_info)
+ },
+ {
+ .ckd_cache = &echo_session_kmem,
+ .ckd_name = "echo_session_kmem",
+ .ckd_size = sizeof(struct echo_session_info)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+/** \defgroup echo_page Page operations
+ *
+ * Echo page operations.
+ *
+ * @{
+ */
+static struct page *echo_page_vmpage(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ return cl2echo_page(slice)->ep_vmpage;
+}
+
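+/*
+ * Page ownership is tracked with the per-page ep_lock mutex: a blocking
+ * caller sleeps on it, while a non-blocking caller gets -EAGAIN if the
+ * trylock fails.
+ */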
+static int echo_page_own(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io, int nonblock)
+{
+ struct echo_page *ep = cl2echo_page(slice);
+
+ if (!nonblock)
+ mutex_lock(&ep->ep_lock);
+ else if (!mutex_trylock(&ep->ep_lock))
+ return -EAGAIN;
+ return 0;
+}
+
+static void echo_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io)
+{
+ struct echo_page *ep = cl2echo_page(slice);
+
+ LASSERT(mutex_is_locked(&ep->ep_lock));
+ mutex_unlock(&ep->ep_lock);
+}
+
+static void echo_page_discard(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ cl_page_delete(env, slice->cpl_page);
+}
+
+static int echo_page_is_vmlocked(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ if (mutex_is_locked(&cl2echo_page(slice)->ep_lock))
+ return -EBUSY;
+ return -ENODATA;
+}
+
+static void echo_page_completion(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret)
+{
+ LASSERT(slice->cpl_page->cp_sync_io != NULL);
+}
+
+static void echo_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+ struct echo_page *ep = cl2echo_page(slice);
+ struct echo_object *eco = cl2echo_obj(slice->cpl_obj);
+ struct page *vmpage = ep->ep_vmpage;
+
+ atomic_dec(&eco->eo_npages);
+ page_cache_release(vmpage);
+}
+
+static int echo_page_prep(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ return 0;
+}
+
+static int echo_page_print(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t printer)
+{
+ struct echo_page *ep = cl2echo_page(slice);
+
+ (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n",
+ ep, mutex_is_locked(&ep->ep_lock), ep->ep_vmpage);
+ return 0;
+}
+
+static const struct cl_page_operations echo_page_ops = {
+ .cpo_own = echo_page_own,
+ .cpo_disown = echo_page_disown,
+ .cpo_discard = echo_page_discard,
+ .cpo_vmpage = echo_page_vmpage,
+ .cpo_fini = echo_page_fini,
+ .cpo_print = echo_page_print,
+ .cpo_is_vmlocked = echo_page_is_vmlocked,
+ .io = {
+ [CRT_READ] = {
+ .cpo_prep = echo_page_prep,
+ .cpo_completion = echo_page_completion,
+ },
+ [CRT_WRITE] = {
+ .cpo_prep = echo_page_prep,
+ .cpo_completion = echo_page_completion,
+ }
+ }
+};
+/** @} echo_page */
+
+/** \defgroup echo_lock Locking
+ *
+ * echo lock operations
+ *
+ * @{
+ */
+static void echo_lock_fini(const struct lu_env *env,
+ struct cl_lock_slice *slice)
+{
+ struct echo_lock *ecl = cl2echo_lock(slice);
+
+ LASSERT(list_empty(&ecl->el_chain));
+ OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem);
+}
+
+static void echo_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct echo_lock *ecl = cl2echo_lock(slice);
+
+ LASSERT(list_empty(&ecl->el_chain));
+}
+
+static int echo_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *unused)
+{
+ return 1;
+}
+
+static struct cl_lock_operations echo_lock_ops = {
+ .clo_fini = echo_lock_fini,
+ .clo_delete = echo_lock_delete,
+ .clo_fits_into = echo_lock_fits_into
+};
+
+/** @} echo_lock */
+
+/** \defgroup echo_cl_ops cl_object operations
+ *
+ * operations for cl_object
+ *
+ * @{
+ */
+static int echo_page_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage)
+{
+ struct echo_page *ep = cl_object_page_slice(obj, page);
+ struct echo_object *eco = cl2echo_obj(obj);
+
+ ep->ep_vmpage = vmpage;
+ page_cache_get(vmpage);
+ mutex_init(&ep->ep_lock);
+ cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops);
+ atomic_inc(&eco->eo_npages);
+ return 0;
+}
+
+static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io)
+{
+ return 0;
+}
+
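+/*
+ * Allocate an echo_lock slice from echo_lock_kmem and attach it to the
+ * cl_lock; el_chain and el_refcount are initialized for later tracking
+ * on the ec_locks list.
+ */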
+static int echo_lock_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *unused)
+{
+ struct echo_lock *el;
+
+ OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, GFP_NOFS);
+ if (el != NULL) {
+ cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops);
+ el->el_object = cl2echo_obj(obj);
+ INIT_LIST_HEAD(&el->el_chain);
+ atomic_set(&el->el_refcount, 0);
+ }
+ return el == NULL ? -ENOMEM : 0;
+}
+
+static int echo_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf)
+{
+ return 0;
+}
+
+static const struct cl_object_operations echo_cl_obj_ops = {
+ .coo_page_init = echo_page_init,
+ .coo_lock_init = echo_lock_init,
+ .coo_io_init = echo_io_init,
+ .coo_conf_set = echo_conf_set
+};
+/** @} echo_cl_ops */
+
+/** \defgroup echo_lu_ops lu_object operations
+ *
+ * operations for echo lu object.
+ *
+ * @{
+ */
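+/*
+ * Initialize an echo object: allocate and stack the object of the next
+ * device (if any), take over the lov_stripe_md supplied through the
+ * configuration, and link the object onto the ec_objects list.
+ */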
+static int echo_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf)
+{
+ struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev));
+ struct echo_client_obd *ec = ed->ed_ec;
+ struct echo_object *eco = cl2echo_obj(lu2cl(obj));
+ const struct cl_object_conf *cconf;
+ struct echo_object_conf *econf;
+
+ if (ed->ed_next) {
+ struct lu_object *below;
+ struct lu_device *under;
+
+ under = ed->ed_next;
+ below = under->ld_ops->ldo_object_alloc(env, obj->lo_header,
+ under);
+ if (below == NULL)
+ return -ENOMEM;
+ lu_object_add(obj, below);
+ }
+
+ cconf = lu2cl_conf(conf);
+ econf = cl2echo_conf(cconf);
+
+ LASSERT(econf->eoc_md);
+ eco->eo_lsm = *econf->eoc_md;
+ /* clear the lsm pointer so that it won't get freed. */
+ *econf->eoc_md = NULL;
+
+ eco->eo_dev = ed;
+ atomic_set(&eco->eo_npages, 0);
+ cl_object_page_init(lu2cl(obj), sizeof(struct echo_page));
+
+ spin_lock(&ec->ec_lock);
+ list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+ spin_unlock(&ec->ec_lock);
+
+ return 0;
+}
+
+/* taken from osc_unpackmd() */
+static int echo_alloc_memmd(struct echo_device *ed,
+ struct lov_stripe_md **lsmp)
+{
+ int lsm_size;
+
+ /* If export is lov/osc then use their obd method */
+ if (ed->ed_next != NULL)
+ return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp);
+ /* OFD has no unpackmd method, do everything here */
+ lsm_size = lov_stripe_md_size(1);
+
+ LASSERT(*lsmp == NULL);
+ OBD_ALLOC(*lsmp, lsm_size);
+ if (*lsmp == NULL)
+ return -ENOMEM;
+
+ OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+ if ((*lsmp)->lsm_oinfo[0] == NULL) {
+ OBD_FREE(*lsmp, lsm_size);
+ return -ENOMEM;
+ }
+
+ loi_init((*lsmp)->lsm_oinfo[0]);
+ (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+ ostid_set_seq_echo(&(*lsmp)->lsm_oi);
+
+ return lsm_size;
+}
+
+static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp)
+{
+ int lsm_size;
+
+ /* If export is lov/osc then use their obd method */
+ if (ed->ed_next != NULL)
+ return obd_free_memmd(ed->ed_ec->ec_exp, lsmp);
+ /* OFD has no unpackmd method, do everything here */
+ lsm_size = lov_stripe_md_size(1);
+
+ LASSERT(*lsmp != NULL);
+ OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+ OBD_FREE(*lsmp, lsm_size);
+ *lsmp = NULL;
+ return 0;
+}
+
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+ struct echo_object *eco = cl2echo_obj(lu2cl(obj));
+ struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+
+ LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+ spin_lock(&ec->ec_lock);
+ list_del_init(&eco->eo_obj_chain);
+ spin_unlock(&ec->ec_lock);
+
+ lu_object_fini(obj);
+ lu_object_header_fini(obj->lo_header);
+
+ if (eco->eo_lsm)
+ echo_free_memmd(eco->eo_dev, &eco->eo_lsm);
+ OBD_SLAB_FREE_PTR(eco, echo_object_kmem);
+}
+
+static int echo_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ struct echo_object *obj = cl2echo_obj(lu2cl(o));
+
+ return (*p)(env, cookie, "echoclient-object@%p", obj);
+}
+
+static const struct lu_object_operations echo_lu_obj_ops = {
+ .loo_object_init = echo_object_init,
+ .loo_object_delete = NULL,
+ .loo_object_release = NULL,
+ .loo_object_free = echo_object_free,
+ .loo_object_print = echo_object_print,
+ .loo_object_invariant = NULL
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops lu_device operations
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
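+/*
+ * Allocate a top-level echo object together with its cl_object_header
+ * and wire up the cl_object and lu_object operation tables.
+ */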
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev)
+{
+ struct echo_object *eco;
+ struct lu_object *obj = NULL;
+
+ /* we're the top dev. */
+ LASSERT(hdr == NULL);
+ OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, GFP_NOFS);
+ if (eco != NULL) {
+ struct cl_object_header *hdr = &eco->eo_hdr;
+
+ obj = &echo_obj2cl(eco)->co_lu;
+ cl_object_header_init(hdr);
+ lu_object_init(obj, &hdr->coh_lu, dev);
+ lu_object_add_top(&hdr->coh_lu, obj);
+
+ eco->eo_cl.co_ops = &echo_cl_obj_ops;
+ obj->lo_ops = &echo_lu_obj_ops;
+ }
+ return obj;
+}
+
+static struct lu_device_operations echo_device_lu_ops = {
+ .ldo_object_alloc = echo_object_alloc,
+};
+
+/** @} echo_lu_dev_ops */
+
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init Setup and teardown
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+ struct cl_site *site = &ed->ed_site_myself;
+ int rc;
+
+ /* initialize site */
+ rc = cl_site_init(site, &ed->ed_cl);
+ if (rc) {
+ CERROR("Cannot initialize site for echo client(%d)\n", rc);
+ return rc;
+ }
+
+ rc = lu_site_init_finish(&site->cs_lu);
+ if (rc)
+ return rc;
+
+ ed->ed_site = site;
+ return 0;
+}
+
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
+{
+ if (ed->ed_site) {
+ cl_site_fini(ed->ed_site);
+ ed->ed_site = NULL;
+ }
+}
+
+static void *echo_thread_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct echo_thread_info *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, GFP_NOFS);
+ if (info == NULL)
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+static void echo_thread_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct echo_thread_info *info = data;
+
+ OBD_SLAB_FREE_PTR(info, echo_thread_kmem);
+}
+
+static void echo_thread_key_exit(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_thread_key = {
+ .lct_tags = LCT_CL_THREAD,
+ .lct_init = echo_thread_key_init,
+ .lct_fini = echo_thread_key_fini,
+ .lct_exit = echo_thread_key_exit
+};
+
+static void *echo_session_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct echo_session_info *session;
+
+ OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, GFP_NOFS);
+ if (session == NULL)
+ session = ERR_PTR(-ENOMEM);
+ return session;
+}
+
+static void echo_session_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct echo_session_info *session = data;
+
+ OBD_SLAB_FREE_PTR(session, echo_session_kmem);
+}
+
+static void echo_session_key_exit(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_session_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = echo_session_key_init,
+ .lct_fini = echo_session_key_fini,
+ .lct_exit = echo_session_key_exit
+};
+
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
+
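+/*
+ * Allocate and set up an echo device: initialize the cl_device and the
+ * site, connect to the target obd via echo_client_setup(), and, when the
+ * target is a CLIO device (lov/osc), stack on top of its lu_device.
+ * Failures unwind through the numbered cleanup ladder at "out".
+ */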
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg)
+{
+ struct lu_device *next;
+ struct echo_device *ed;
+ struct cl_device *cd;
+ struct obd_device *obd = NULL; /* to keep compiler happy */
+ struct obd_device *tgt;
+ const char *tgt_type_name;
+ int rc;
+ int cleanup = 0;
+
+ OBD_ALLOC_PTR(ed);
+ if (ed == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ cleanup = 1;
+ cd = &ed->ed_cl;
+ rc = cl_device_init(cd, t);
+ if (rc)
+ goto out;
+
+ cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+ cd->cd_ops = &echo_device_cl_ops;
+
+ cleanup = 2;
+ obd = class_name2obd(lustre_cfg_string(cfg, 0));
+ LASSERT(obd != NULL);
+ LASSERT(env != NULL);
+
+ tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+ if (tgt == NULL) {
+ CERROR("Can not find tgt device %s\n",
+ lustre_cfg_string(cfg, 1));
+ rc = -ENODEV;
+ goto out;
+ }
+
+ next = tgt->obd_lu_dev;
+ if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+ CERROR("echo MDT client must be run on server\n");
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = echo_site_init(env, ed);
+ if (rc)
+ goto out;
+
+ cleanup = 3;
+
+ rc = echo_client_setup(env, obd, cfg);
+ if (rc)
+ goto out;
+
+ ed->ed_ec = &obd->u.echo_client;
+ cleanup = 4;
+
+ /* If the echo client is to be stacked upon an OST device, next is
+ * NULL, since the OST is not a CLIO device so far. */
+ if (next != NULL && !lu_device_is_cl(next))
+ next = NULL;
+
+ tgt_type_name = tgt->obd_type->typ_name;
+ if (next != NULL) {
+ LASSERT(next != NULL);
+ if (next->ld_site != NULL) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ next->ld_site = &ed->ed_site->cs_lu;
+ rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+ next->ld_type->ldt_name,
+ NULL);
+ if (rc)
+ goto out;
+
+ /* Tricky case: the obd type must be determined here, since
+ * CLIO uses different parameters to initialize objects
+ * for lov and osc. */
+ if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+ ed->ed_next_islov = 1;
+ else
+ LASSERT(strcmp(tgt_type_name,
+ LUSTRE_OSC_NAME) == 0);
+ } else {
+ LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+ }
+
+ ed->ed_next = next;
+ return &cd->cd_lu_dev;
+out:
+ switch (cleanup) {
+ case 4: {
+ int rc2;
+
+ rc2 = echo_client_cleanup(obd);
+ if (rc2)
+ CERROR("Cleanup obd device %s error(%d)\n",
+ obd->obd_name, rc2);
+ }
+
+ case 3:
+ echo_site_fini(env, ed);
+ case 2:
+ cl_device_fini(&ed->ed_cl);
+ case 1:
+ OBD_FREE_PTR(ed);
+ case 0:
+ default:
+ break;
+ }
+ return ERR_PTR(rc);
+}
+
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+ const char *name, struct lu_device *next)
+{
+ LBUG();
+ return 0;
+}
+
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+ struct lu_device *next = ed->ed_next;
+
+ while (next)
+ next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
+ return NULL;
+}
+
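+/*
+ * Release the cl_lock backing an echo lock; when the lock is no longer
+ * used, also cancel and delete it under the lock mutex.
+ */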
+static void echo_lock_release(const struct lu_env *env,
+ struct echo_lock *ecl,
+ int still_used)
+{
+ struct cl_lock *clk = echo_lock2cl(ecl);
+
+ cl_lock_get(clk);
+ cl_unuse(env, clk);
+ cl_lock_release(env, clk, "ec enqueue", ecl->el_object);
+ if (!still_used) {
+ cl_lock_mutex_get(env, clk);
+ cl_lock_cancel(env, clk);
+ cl_lock_delete(env, clk);
+ cl_lock_mutex_put(env, clk);
+ }
+ cl_lock_put(env, clk);
+}
+
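+/*
+ * Tear down an echo device: mark all remaining echo objects as deleted,
+ * purge the site repeatedly until every object reference is dropped,
+ * then clean up the client connection, the stacked devices and the site.
+ */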
+static struct lu_device *echo_device_free(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+ struct echo_client_obd *ec = ed->ed_ec;
+ struct echo_object *eco;
+ struct lu_device *next = ed->ed_next;
+
+ CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n",
+ ed, next);
+
+ lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+ /* Check whether there are objects still alive.
+ * There should be none, because lu_site_purge() cleans up all
+ * cached objects. If any remain, the echo device is probably
+ * being accessed in parallel.
+ */
+ spin_lock(&ec->ec_lock);
+ list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain)
+ eco->eo_deleted = 1;
+ spin_unlock(&ec->ec_lock);
+
+ /* purge again */
+ lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+ CDEBUG(D_INFO,
+ "Waiting for the reference of echo object to be dropped\n");
+
+ /* Wait for the last reference to be dropped. */
+ spin_lock(&ec->ec_lock);
+ while (!list_empty(&ec->ec_objects)) {
+ spin_unlock(&ec->ec_lock);
+ CERROR("echo_client still has objects at cleanup time, wait for 1 second\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(cfs_time_seconds(1));
+ lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+ spin_lock(&ec->ec_lock);
+ }
+ spin_unlock(&ec->ec_lock);
+
+ LASSERT(list_empty(&ec->ec_locks));
+
+ CDEBUG(D_INFO, "No object exists, exiting...\n");
+
+ echo_client_cleanup(d->ld_obd);
+
+ while (next)
+ next = next->ld_type->ldt_ops->ldto_device_free(env, next);
+
+ LASSERT(ed->ed_site == lu2cl_site(d->ld_site));
+ echo_site_fini(env, ed);
+ cl_device_fini(&ed->ed_cl);
+ OBD_FREE_PTR(ed);
+
+ return NULL;
+}
+
+static const struct lu_device_type_operations echo_device_type_ops = {
+ .ldto_init = echo_type_init,
+ .ldto_fini = echo_type_fini,
+
+ .ldto_start = echo_type_start,
+ .ldto_stop = echo_type_stop,
+
+ .ldto_device_alloc = echo_device_alloc,
+ .ldto_device_free = echo_device_free,
+ .ldto_device_init = echo_device_init,
+ .ldto_device_fini = echo_device_fini
+};
+
+static struct lu_device_type echo_device_type = {
+ .ldt_tags = LU_DEVICE_CL,
+ .ldt_name = LUSTRE_ECHO_CLIENT_NAME,
+ .ldt_ops = &echo_device_type_ops,
+ .ldt_ctx_tags = LCT_CL_THREAD,
+};
+/** @} echo_init */
+
+/** \defgroup echo_exports Exported operations
+ *
+ * exporting functions to echo client
+ *
+ * @{
+ */
+
+/* Interfaces to echo client obd device */
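+/*
+ * Look up (or create) the cl_object corresponding to an lsm.  The ostid
+ * is converted to a FID for the lookup; on object creation the
+ * configuration consumes *lsmp (see echo_object_init()).  Returns an
+ * ERR_PTR on failure or when the object is already being deleted.
+ */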
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+ struct lov_stripe_md **lsmp)
+{
+ struct lu_env *env;
+ struct echo_thread_info *info;
+ struct echo_object_conf *conf;
+ struct lov_stripe_md *lsm;
+ struct echo_object *eco;
+ struct cl_object *obj;
+ struct lu_fid *fid;
+ int refcheck;
+ int rc;
+
+ LASSERT(lsmp);
+ lsm = *lsmp;
+ LASSERT(lsm);
+ LASSERTF(ostid_id(&lsm->lsm_oi) != 0, DOSTID"\n", POSTID(&lsm->lsm_oi));
+ LASSERTF(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO, DOSTID"\n",
+ POSTID(&lsm->lsm_oi));
+
+ /* Never return an object if the obd is to be freed. */
+ if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping)
+ return ERR_PTR(-ENODEV);
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return (void *)env;
+
+ info = echo_env_info(env);
+ conf = &info->eti_conf;
+ if (d->ed_next) {
+ if (!d->ed_next_islov) {
+ struct lov_oinfo *oinfo = lsm->lsm_oinfo[0];
+
+ LASSERT(oinfo != NULL);
+ oinfo->loi_oi = lsm->lsm_oi;
+ conf->eoc_cl.u.coc_oinfo = oinfo;
+ } else {
+ struct lustre_md *md;
+
+ md = &info->eti_md;
+ memset(md, 0, sizeof(*md));
+ md->lsm = lsm;
+ conf->eoc_cl.u.coc_md = md;
+ }
+ }
+ conf->eoc_md = lsmp;
+
+ fid = &info->eti_fid;
+ rc = ostid_to_fid(fid, &lsm->lsm_oi, 0);
+ if (rc != 0) {
+ eco = ERR_PTR(rc);
+ goto out;
+ }
+
+ /* In the function below, .hs_keycmp resolves to
+ * lu_obj_hop_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl);
+ if (IS_ERR(obj)) {
+ eco = (void *)obj;
+ goto out;
+ }
+
+ eco = cl2echo_obj(obj);
+ if (eco->eo_deleted) {
+ cl_object_put(env, obj);
+ eco = ERR_PTR(-EAGAIN);
+ }
+
+out:
+ cl_env_put(env, &refcheck);
+ return eco;
+}
+
+static int cl_echo_object_put(struct echo_object *eco)
+{
+ struct lu_env *env;
+ struct cl_object *obj = echo_obj2cl(eco);
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ /* an external function to kill an object? */
+ if (eco->eo_deleted) {
+ struct lu_object_header *loh = obj->co_lu.lo_header;
+
+ LASSERT(&eco->eo_hdr == luh2coh(loh));
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags);
+ }
+
+ cl_object_put(env, obj);
+ cl_env_put(env, &refcheck);
+ return 0;
+}
+
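+/*
+ * Take an extent lock over [start, end] on the echo object: build a
+ * cl_lock_descr in page units, request and wait for the lock, then
+ * record the echo lock on ec_locks and hand a unique cookie back to
+ * the caller.
+ */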
+static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco,
+ u64 start, u64 end, int mode,
+ __u64 *cookie, __u32 enqflags)
+{
+ struct cl_io *io;
+ struct cl_lock *lck;
+ struct cl_object *obj;
+ struct cl_lock_descr *descr;
+ struct echo_thread_info *info;
+ int rc = -ENOMEM;
+
+ info = echo_env_info(env);
+ io = &info->eti_io;
+ descr = &info->eti_descr;
+ obj = echo_obj2cl(eco);
+
+ descr->cld_obj = obj;
+ descr->cld_start = cl_index(obj, start);
+ descr->cld_end = cl_index(obj, end);
+ descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ;
+ descr->cld_enq_flags = enqflags;
+ io->ci_obj = obj;
+
+ lck = cl_lock_request(env, io, descr, "ec enqueue", eco);
+ if (lck) {
+ struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+ struct echo_lock *el;
+
+ rc = cl_wait(env, lck);
+ if (rc == 0) {
+ el = cl2echo_lock(cl_lock_at(lck, &echo_device_type));
+ spin_lock(&ec->ec_lock);
+ if (list_empty(&el->el_chain)) {
+ list_add(&el->el_chain, &ec->ec_locks);
+ el->el_cookie = ++ec->ec_unique;
+ }
+ atomic_inc(&el->el_refcount);
+ *cookie = el->el_cookie;
+ spin_unlock(&ec->ec_lock);
+ } else {
+ cl_lock_release(env, lck, "ec enqueue", current);
+ }
+ }
+ return rc;
+}
+
+static int cl_echo_enqueue(struct echo_object *eco, u64 start, u64 end,
+ int mode, __u64 *cookie)
+{
+ struct echo_thread_info *info;
+ struct lu_env *env;
+ struct cl_io *io;
+ int refcheck;
+ int result;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ info = echo_env_info(env);
+ io = &info->eti_io;
+
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco));
+ if (result < 0)
+ goto out;
+ LASSERT(result == 0);
+
+ result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0);
+ cl_io_fini(env, io);
+
+out:
+ cl_env_put(env, &refcheck);
+ return result;
+}
+
+static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed,
+ __u64 cookie)
+{
+ struct echo_client_obd *ec = ed->ed_ec;
+ struct echo_lock *ecl = NULL;
+ struct list_head *el;
+ int found = 0, still_used = 0;
+
+ LASSERT(ec != NULL);
+ spin_lock(&ec->ec_lock);
+ list_for_each(el, &ec->ec_locks) {
+ ecl = list_entry(el, struct echo_lock, el_chain);
+ CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie);
+ found = (ecl->el_cookie == cookie);
+ if (found) {
+ if (atomic_dec_and_test(&ecl->el_refcount))
+ list_del_init(&ecl->el_chain);
+ else
+ still_used = 1;
+ break;
+ }
+ }
+ spin_unlock(&ec->ec_lock);
+
+ if (!found)
+ return -ENOENT;
+
+ echo_lock_release(env, ecl, still_used);
+ return 0;
+}
+
+static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
+{
+ struct lu_env *env;
+ int refcheck;
+ int rc;
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ rc = cl_echo_cancel0(env, ed, cookie);
+
+ cl_env_put(env, &refcheck);
+ return rc;
+}
+
+static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type unused, struct cl_2queue *queue)
+{
+ struct cl_page *clp;
+ struct cl_page *temp;
+ int result = 0;
+
+ cl_page_list_for_each_safe(clp, temp, &queue->c2_qin) {
+ int rc;
+
+ rc = cl_page_cache_add(env, io, clp, CRT_WRITE);
+ if (rc == 0)
+ continue;
+ result = result ?: rc;
+ }
+ return result;
+}
+
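+/*
+ * Bulk read/write through the cl_io stack: take a PR/PW extent lock over
+ * the range, wrap each page into a transient cl_page, and submit the
+ * queue either synchronously or (for writes) asynchronously.
+ */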
+static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
+ struct page **pages, int npages, int async)
+{
+ struct lu_env *env;
+ struct echo_thread_info *info;
+ struct cl_object *obj = echo_obj2cl(eco);
+ struct echo_device *ed = eco->eo_dev;
+ struct cl_2queue *queue;
+ struct cl_io *io;
+ struct cl_page *clp;
+ struct lustre_handle lh = { 0 };
+ int page_size = cl_page_size(obj);
+ int refcheck;
+ int rc;
+ int i;
+
+ LASSERT((offset & ~CFS_PAGE_MASK) == 0);
+ LASSERT(ed->ed_next != NULL);
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ info = echo_env_info(env);
+ io = &info->eti_io;
+ queue = &info->eti_queue;
+
+ cl_2queue_init(queue);
+
+ io->ci_ignore_layout = 1;
+ rc = cl_io_init(env, io, CIT_MISC, obj);
+ if (rc < 0)
+ goto out;
+ LASSERT(rc == 0);
+
+ rc = cl_echo_enqueue0(env, eco, offset,
+ offset + npages * PAGE_CACHE_SIZE - 1,
+ rw == READ ? LCK_PR : LCK_PW, &lh.cookie,
+ CEF_NEVER);
+ if (rc < 0)
+ goto error_lock;
+
+ for (i = 0; i < npages; i++) {
+ LASSERT(pages[i]);
+ clp = cl_page_find(env, obj, cl_index(obj, offset),
+ pages[i], CPT_TRANSIENT);
+ if (IS_ERR(clp)) {
+ rc = PTR_ERR(clp);
+ break;
+ }
+ LASSERT(clp->cp_type == CPT_TRANSIENT);
+
+ rc = cl_page_own(env, io, clp);
+ if (rc) {
+ LASSERT(clp->cp_state == CPS_FREEING);
+ cl_page_put(env, clp);
+ break;
+ }
+
+ cl_2queue_add(queue, clp);
+
+ /* Drop the reference taken by cl_page_find() so that the page
+ * will be freed in cl_2queue_fini(). */
+ cl_page_put(env, clp);
+ cl_page_clip(env, clp, 0, page_size);
+
+ offset += page_size;
+ }
+
+ if (rc == 0) {
+ enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE;
+
+ async = async && (typ == CRT_WRITE);
+ if (async)
+ rc = cl_echo_async_brw(env, io, typ, queue);
+ else
+ rc = cl_io_submit_sync(env, io, typ, queue, 0);
+ CDEBUG(D_INFO, "echo_client %s write returns %d\n",
+ async ? "async" : "sync", rc);
+ }
+
+ cl_echo_cancel0(env, ed, lh.cookie);
+error_lock:
+ cl_2queue_discard(env, io, queue);
+ cl_2queue_disown(env, io, queue);
+ cl_2queue_fini(env, queue);
+ cl_io_fini(env, io);
+out:
+ cl_env_put(env, &refcheck);
+ return rc;
+}
+/** @} echo_exports */
+
+static u64 last_object_id;
+
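+/*
+ * Copy an in-kernel lov_stripe_md, including the per-stripe lov_oinfo
+ * entries, out to the user-supplied buffer after checking that the
+ * buffer is large enough.
+ */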
+static int
+echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
+{
+ struct lov_stripe_md *ulsm = _ulsm;
+ int nob, i;
+
+ nob = offsetof(struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]);
+ if (nob > ulsm_nob)
+ return -EINVAL;
+
+ if (copy_to_user(ulsm, lsm, sizeof(*ulsm)))
+ return -EFAULT;
+
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ if (copy_to_user(ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i],
+ sizeof(lsm->lsm_oinfo[0])))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int
+echo_copyin_lsm(struct echo_device *ed, struct lov_stripe_md *lsm,
+ void *ulsm, int ulsm_nob)
+{
+ struct echo_client_obd *ec = ed->ed_ec;
+ int i;
+
+ if (ulsm_nob < sizeof(*lsm))
+ return -EINVAL;
+
+ if (copy_from_user(lsm, ulsm, sizeof(*lsm)))
+ return -EFAULT;
+
+ if (lsm->lsm_stripe_count > ec->ec_nstripes ||
+ lsm->lsm_magic != LOV_MAGIC ||
+ (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 ||
+ ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL))
+ return -EINVAL;
+
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ if (copy_from_user(lsm->lsm_oinfo[i],
+ ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i],
+ sizeof(lsm->lsm_oinfo[0])))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int echo_create_object(const struct lu_env *env, struct echo_device *ed,
+ int on_target, struct obdo *oa, void *ulsm,
+ int ulsm_nob, struct obd_trans_info *oti)
+{
+ struct echo_object *eco;
+ struct echo_client_obd *ec = ed->ed_ec;
+ struct lov_stripe_md *lsm = NULL;
+ int rc;
+ int created = 0;
+
+ if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
+ (on_target || /* set_stripe */
+ ec->ec_nstripes != 0)) { /* LOV */
+ CERROR("No valid oid\n");
+ return -EINVAL;
+ }
+
+ rc = echo_alloc_memmd(ed, &lsm);
+ if (rc < 0) {
+ CERROR("Cannot allocate md: rc = %d\n", rc);
+ goto failed;
+ }
+
+ if (ulsm != NULL) {
+ int i, idx;
+
+ rc = echo_copyin_lsm(ed, lsm, ulsm, ulsm_nob);
+ if (rc != 0)
+ goto failed;
+
+ if (lsm->lsm_stripe_count == 0)
+ lsm->lsm_stripe_count = ec->ec_nstripes;
+
+ if (lsm->lsm_stripe_size == 0)
+ lsm->lsm_stripe_size = PAGE_CACHE_SIZE;
+
+ idx = cfs_rand();
+
+ /* setup stripes: indices + default ids if required */
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0)
+ lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi;
+
+ lsm->lsm_oinfo[i]->loi_ost_idx =
+ (idx + i) % ec->ec_nstripes;
+ }
+ }
+
+ /* setup object ID here for !on_target and LOV hint */
+ if (oa->o_valid & OBD_MD_FLID) {
+ LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+ lsm->lsm_oi = oa->o_oi;
+ }
+
+ if (ostid_id(&lsm->lsm_oi) == 0)
+ ostid_set_id(&lsm->lsm_oi, ++last_object_id);
+
+ rc = 0;
+ if (on_target) {
+ /* Only echo objects are allowed to be created */
+ LASSERT((oa->o_valid & OBD_MD_FLGROUP) &&
+ (ostid_seq(&oa->o_oi) == FID_SEQ_ECHO));
+ rc = obd_create(env, ec->ec_exp, oa, &lsm, oti);
+ if (rc != 0) {
+ CERROR("Cannot create objects: rc = %d\n", rc);
+ goto failed;
+ }
+ created = 1;
+ }
+
+ /* See what object ID we were given */
+ oa->o_oi = lsm->lsm_oi;
+ oa->o_valid |= OBD_MD_FLID;
+
+ eco = cl_echo_object_find(ed, &lsm);
+ if (IS_ERR(eco)) {
+ rc = PTR_ERR(eco);
+ goto failed;
+ }
+ cl_echo_object_put(eco);
+
+ CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi));
+
+ failed:
+ if (created && rc)
+ obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL);
+ if (lsm)
+ echo_free_memmd(ed, &lsm);
+ if (rc)
+ CERROR("create object failed with: rc = %d\n", rc);
+ return rc;
+}
+
+static int echo_get_object(struct echo_object **ecop, struct echo_device *ed,
+ struct obdo *oa)
+{
+ struct lov_stripe_md *lsm = NULL;
+ struct echo_object *eco;
+ int rc;
+
+ if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) {
+ /* disallow use of object id 0 */
+ CERROR("No valid oid\n");
+ return -EINVAL;
+ }
+
+ rc = echo_alloc_memmd(ed, &lsm);
+ if (rc < 0)
+ return rc;
+
+ lsm->lsm_oi = oa->o_oi;
+ if (!(oa->o_valid & OBD_MD_FLGROUP))
+ ostid_set_seq_echo(&lsm->lsm_oi);
+
+ rc = 0;
+ eco = cl_echo_object_find(ed, &lsm);
+ if (!IS_ERR(eco))
+ *ecop = eco;
+ else
+ rc = PTR_ERR(eco);
+ if (lsm)
+ echo_free_memmd(ed, &lsm);
+ return rc;
+}
+
+static void echo_put_object(struct echo_object *eco)
+{
+ if (cl_echo_object_put(eco))
+ CERROR("echo client: drop an object failed");
+}
+
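+/*
+ * Map a file-logical offset to the stripe that holds it: *idp becomes
+ * the object id of that stripe and *offp the offset within the stripe
+ * object.  Single-stripe objects are left untouched.
+ */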
+static void
+echo_get_stripe_off_id(struct lov_stripe_md *lsm, u64 *offp, u64 *idp)
+{
+ unsigned long stripe_count;
+ unsigned long stripe_size;
+ unsigned long width;
+ unsigned long woffset;
+ int stripe_index;
+ u64 offset;
+
+ if (lsm->lsm_stripe_count <= 1)
+ return;
+
+ offset = *offp;
+ stripe_size = lsm->lsm_stripe_size;
+ stripe_count = lsm->lsm_stripe_count;
+
+ /* width = # bytes in all stripes */
+ width = stripe_size * stripe_count;
+
+ /* woffset = offset within a width; offset = whole number of widths */
+ woffset = do_div(offset, width);
+
+ stripe_index = woffset / stripe_size;
+
+ *idp = ostid_id(&lsm->lsm_oinfo[stripe_index]->loi_oi);
+ *offp = offset * stripe_size + woffset % stripe_size;
+}
+
+static void
+echo_client_page_debug_setup(struct lov_stripe_md *lsm,
+ struct page *page, int rw, u64 id,
+ u64 offset, u64 count)
+{
+ char *addr;
+ u64 stripe_off;
+ u64 stripe_id;
+ int delta;
+
+ /* no partial pages on the client */
+ LASSERT(count == PAGE_CACHE_SIZE);
+
+ addr = kmap(page);
+
+ for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+ if (rw == OBD_BRW_WRITE) {
+ stripe_off = offset + delta;
+ stripe_id = id;
+ echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+ } else {
+ stripe_off = 0xdeadbeef00c0ffeeULL;
+ stripe_id = 0xdeadbeef00c0ffeeULL;
+ }
+ block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE,
+ stripe_off, stripe_id);
+ }
+
+ kunmap(page);
+}
+
+static int echo_client_page_debug_check(struct lov_stripe_md *lsm,
+ struct page *page, u64 id,
+ u64 offset, u64 count)
+{
+ u64 stripe_off;
+ u64 stripe_id;
+ char *addr;
+ int delta;
+ int rc;
+ int rc2;
+
+ /* no partial pages on the client */
+ LASSERT(count == PAGE_CACHE_SIZE);
+
+ addr = kmap(page);
+
+ for (rc = delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+ stripe_off = offset + delta;
+ stripe_id = id;
+ echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+
+ rc2 = block_debug_check("test_brw",
+ addr + delta, OBD_ECHO_BLOCK_SIZE,
+ stripe_off, stripe_id);
+ if (rc2 != 0) {
+ CERROR("Error in echo object %#llx\n", id);
+ rc = rc2;
+ }
+ }
+
+ kunmap(page);
+ return rc;
+}
+
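+/*
+ * Kernel-buffer bulk I/O: allocate one page per PAGE_CACHE_SIZE of the
+ * request, optionally stamp (write) or verify (read) the block-debug
+ * pattern, and push the pages through cl_echo_object_brw().
+ */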
+static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
+ struct echo_object *eco, u64 offset,
+ u64 count, int async,
+ struct obd_trans_info *oti)
+{
+ struct lov_stripe_md *lsm = eco->eo_lsm;
+ u32 npages;
+ struct brw_page *pga;
+ struct brw_page *pgp;
+ struct page **pages;
+ u64 off;
+ int i;
+ int rc;
+ int verify;
+ gfp_t gfp_mask;
+ int brw_flags = 0;
+
+ verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID &&
+ (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+ (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+
+ gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER;
+
+ LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
+ LASSERT(lsm != NULL);
+ LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi));
+
+ if (count <= 0 ||
+ (count & (~CFS_PAGE_MASK)) != 0)
+ return -EINVAL;
+
+ /* XXX think again with misaligned I/O */
+ npages = count >> PAGE_CACHE_SHIFT;
+
+ if (rw == OBD_BRW_WRITE)
+ brw_flags = OBD_BRW_ASYNC;
+
+ OBD_ALLOC(pga, npages * sizeof(*pga));
+ if (pga == NULL)
+ return -ENOMEM;
+
+ OBD_ALLOC(pages, npages * sizeof(*pages));
+ if (pages == NULL) {
+ OBD_FREE(pga, npages * sizeof(*pga));
+ return -ENOMEM;
+ }
+
+ for (i = 0, pgp = pga, off = offset;
+ i < npages;
+ i++, pgp++, off += PAGE_CACHE_SIZE) {
+
+ LASSERT(pgp->pg == NULL); /* for cleanup */
+
+ rc = -ENOMEM;
+ OBD_PAGE_ALLOC(pgp->pg, gfp_mask);
+ if (pgp->pg == NULL)
+ goto out;
+
+ pages[i] = pgp->pg;
+ pgp->count = PAGE_CACHE_SIZE;
+ pgp->off = off;
+ pgp->flag = brw_flags;
+
+ if (verify)
+ echo_client_page_debug_setup(lsm, pgp->pg, rw,
+ ostid_id(&oa->o_oi), off,
+ pgp->count);
+ }
+
+ /* brw mode can only be used at client */
+ LASSERT(ed->ed_next != NULL);
+ rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async);
+
+ out:
+ if (rc != 0 || rw != OBD_BRW_READ)
+ verify = 0;
+
+ for (i = 0, pgp = pga; i < npages; i++, pgp++) {
+ if (pgp->pg == NULL)
+ continue;
+
+ if (verify) {
+ int vrc;
+
+ vrc = echo_client_page_debug_check(lsm, pgp->pg,
+ ostid_id(&oa->o_oi),
+ pgp->off, pgp->count);
+ if (vrc != 0 && rc == 0)
+ rc = vrc;
+ }
+ OBD_PAGE_FREE(pgp->pg);
+ }
+ OBD_FREE(pga, npages * sizeof(*pga));
+ OBD_FREE(pages, npages * sizeof(*pages));
+ return rc;
+}
+
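+/*
+ * Drive the server-side obd_preprw()/obd_commitrw() path directly,
+ * processing the request in batches of "batch" bytes and optionally
+ * stamping or verifying the debug pattern on each local page.
+ */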
+static int echo_client_prep_commit(const struct lu_env *env,
+ struct obd_export *exp, int rw,
+ struct obdo *oa, struct echo_object *eco,
+ u64 offset, u64 count,
+ u64 batch, struct obd_trans_info *oti,
+ int async)
+{
+ struct lov_stripe_md *lsm = eco->eo_lsm;
+ struct obd_ioobj ioo;
+ struct niobuf_local *lnb;
+ struct niobuf_remote *rnb;
+ u64 off;
+ u64 npages, tot_pages;
+ int i, ret = 0, brw_flags = 0;
+
+ if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 ||
+ (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi)))
+ return -EINVAL;
+
+ npages = batch >> PAGE_CACHE_SHIFT;
+ tot_pages = count >> PAGE_CACHE_SHIFT;
+
+ OBD_ALLOC(lnb, npages * sizeof(struct niobuf_local));
+ OBD_ALLOC(rnb, npages * sizeof(struct niobuf_remote));
+
+ if (lnb == NULL || rnb == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (rw == OBD_BRW_WRITE && async)
+ brw_flags |= OBD_BRW_ASYNC;
+
+ obdo_to_ioobj(oa, &ioo);
+
+ off = offset;
+
+ for (; tot_pages; tot_pages -= npages) {
+ int lpages;
+
+ if (tot_pages < npages)
+ npages = tot_pages;
+
+ for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) {
+ rnb[i].offset = off;
+ rnb[i].len = PAGE_CACHE_SIZE;
+ rnb[i].flags = brw_flags;
+ }
+
+ ioo.ioo_bufcnt = npages;
+ oti->oti_transno = 0;
+
+ lpages = npages;
+ ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages,
+ lnb, oti, NULL);
+ if (ret != 0)
+ goto out;
+ LASSERT(lpages == npages);
+
+ for (i = 0; i < lpages; i++) {
+ struct page *page = lnb[i].page;
+
+ /* read past eof? */
+ if (page == NULL && lnb[i].rc == 0)
+ continue;
+
+ if (async)
+ lnb[i].flags |= OBD_BRW_ASYNC;
+
+ if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID ||
+ (oa->o_valid & OBD_MD_FLFLAGS) == 0 ||
+ (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0)
+ continue;
+
+ if (rw == OBD_BRW_WRITE)
+ echo_client_page_debug_setup(lsm, page, rw,
+ ostid_id(&oa->o_oi),
+ rnb[i].offset,
+ rnb[i].len);
+ else
+ echo_client_page_debug_check(lsm, page,
+ ostid_id(&oa->o_oi),
+ rnb[i].offset,
+ rnb[i].len);
+ }
+
+ ret = obd_commitrw(env, rw, exp, oa, 1, &ioo,
+ rnb, npages, lnb, oti, ret);
+ if (ret != 0)
+ goto out;
+
+ /* Reset oti otherwise it would confuse ldiskfs. */
+ memset(oti, 0, sizeof(*oti));
+
+ /* Reuse env context. */
+ lu_context_exit((struct lu_context *)&env->le_ctx);
+ lu_context_enter((struct lu_context *)&env->le_ctx);
+ }
+
+out:
+ if (lnb)
+ OBD_FREE(lnb, npages * sizeof(struct niobuf_local));
+ if (rnb)
+ OBD_FREE(rnb, npages * sizeof(struct niobuf_remote));
+ return ret;
+}
+
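+/*
+ * Dispatch an OBD_IOC_BRW_* request: test modes 1 and 2 use the
+ * client-side kbrw path, while test mode 3 (forced when there is no
+ * stacked CLIO device) uses the prep/commit path with ioc_plen1 as the
+ * batch size.
+ */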
+static int echo_client_brw_ioctl(const struct lu_env *env, int rw,
+ struct obd_export *exp,
+ struct obd_ioctl_data *data,
+ struct obd_trans_info *dummy_oti)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct echo_device *ed = obd2echo_dev(obd);
+ struct echo_client_obd *ec = ed->ed_ec;
+ struct obdo *oa = &data->ioc_obdo1;
+ struct echo_object *eco;
+ int rc;
+ int async = 1;
+ long test_mode;
+
+ LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc)
+ return rc;
+
+ oa->o_valid &= ~OBD_MD_FLHANDLE;
+
+ /* OFD/obdfilter works only via prep/commit */
+ test_mode = (long)data->ioc_pbuf1;
+ if (test_mode == 1)
+ async = 0;
+
+ if (ed->ed_next == NULL && test_mode != 3) {
+ test_mode = 3;
+ data->ioc_plen1 = data->ioc_count;
+ }
+
+ /* Truncate batch size to maximum */
+ if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE)
+ data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE;
+
+ switch (test_mode) {
+ case 1:
+ /* fall through */
+ case 2:
+ rc = echo_client_kbrw(ed, rw, oa,
+ eco, data->ioc_offset,
+ data->ioc_count, async, dummy_oti);
+ break;
+ case 3:
+ rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa,
+ eco, data->ioc_offset,
+ data->ioc_count, data->ioc_plen1,
+ dummy_oti, async);
+ break;
+ default:
+ rc = -EINVAL;
+ }
+ echo_put_object(eco);
+ return rc;
+}
+
+static int
+echo_client_enqueue(struct obd_export *exp, struct obdo *oa,
+ int mode, u64 offset, u64 nob)
+{
+ struct echo_device *ed = obd2echo_dev(exp->exp_obd);
+ struct lustre_handle *ulh = &oa->o_handle;
+ struct echo_object *eco;
+ u64 end;
+ int rc;
+
+ if (ed->ed_next == NULL)
+ return -EOPNOTSUPP;
+
+ if (!(mode == LCK_PR || mode == LCK_PW))
+ return -EINVAL;
+
+ if ((offset & (~CFS_PAGE_MASK)) != 0 ||
+ (nob & (~CFS_PAGE_MASK)) != 0)
+ return -EINVAL;
+
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc != 0)
+ return rc;
+
+ end = (nob == 0) ? ((u64) -1) : (offset + nob - 1);
+ rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie);
+ if (rc == 0) {
+ oa->o_valid |= OBD_MD_FLHANDLE;
+ CDEBUG(D_INFO, "Cookie is %#llx\n", ulh->cookie);
+ }
+ echo_put_object(eco);
+ return rc;
+}
+
+static int
+echo_client_cancel(struct obd_export *exp, struct obdo *oa)
+{
+ struct echo_device *ed = obd2echo_dev(exp->exp_obd);
+ __u64 cookie = oa->o_handle.cookie;
+
+ if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
+ return -EINVAL;
+
+ CDEBUG(D_INFO, "Cookie is %#llx\n", cookie);
+ return cl_echo_cancel(ed, cookie);
+}
+
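+/*
+ * Main ioctl entry point for the echo client: handles object create,
+ * destroy, getattr/setattr, bulk I/O, stripe get/set and lock
+ * enqueue/cancel.  Destructive commands require CFS_CAP_SYS_ADMIN, and
+ * any ack locks accumulated in dummy_oti are dropped on the way out.
+ */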
+static int
+echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void *uarg)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct echo_device *ed = obd2echo_dev(obd);
+ struct echo_client_obd *ec = ed->ed_ec;
+ struct echo_object *eco;
+ struct obd_ioctl_data *data = karg;
+ struct obd_trans_info dummy_oti;
+ struct lu_env *env;
+ struct oti_req_ack_lock *ack_lock;
+ struct obdo *oa;
+ struct lu_fid fid;
+ int rw = OBD_BRW_READ;
+ int rc = 0;
+ int i;
+
+ memset(&dummy_oti, 0, sizeof(dummy_oti));
+
+ oa = &data->ioc_obdo1;
+ if (!(oa->o_valid & OBD_MD_FLGROUP)) {
+ oa->o_valid |= OBD_MD_FLGROUP;
+ ostid_set_seq_echo(&oa->o_oi);
+ }
+
+ /* This FID is unpacked just for validation at this point */
+ rc = ostid_to_fid(&fid, &oa->o_oi, 0);
+ if (rc < 0)
+ return rc;
+
+ OBD_ALLOC_PTR(env);
+ if (env == NULL)
+ return -ENOMEM;
+
+ rc = lu_env_init(env, LCT_DT_THREAD);
+ if (rc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ switch (cmd) {
+ case OBD_IOC_CREATE: /* may create echo object */
+ if (!capable(CFS_CAP_SYS_ADMIN)) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1,
+ data->ioc_plen1, &dummy_oti);
+ goto out;
+
+ case OBD_IOC_DESTROY:
+ if (!capable(CFS_CAP_SYS_ADMIN)) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc == 0) {
+ rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm,
+ &dummy_oti, NULL, NULL);
+ if (rc == 0)
+ eco->eo_deleted = 1;
+ echo_put_object(eco);
+ }
+ goto out;
+
+ case OBD_IOC_GETATTR:
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc == 0) {
+ struct obd_info oinfo = { { { 0 } } };
+
+ oinfo.oi_md = eco->eo_lsm;
+ oinfo.oi_oa = oa;
+ rc = obd_getattr(env, ec->ec_exp, &oinfo);
+ echo_put_object(eco);
+ }
+ goto out;
+
+ case OBD_IOC_SETATTR:
+ if (!capable(CFS_CAP_SYS_ADMIN)) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc == 0) {
+ struct obd_info oinfo = { { { 0 } } };
+
+ oinfo.oi_oa = oa;
+ oinfo.oi_md = eco->eo_lsm;
+
+ rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL);
+ echo_put_object(eco);
+ }
+ goto out;
+
+ case OBD_IOC_BRW_WRITE:
+ if (!capable(CFS_CAP_SYS_ADMIN)) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ rw = OBD_BRW_WRITE;
+ /* fall through */
+ case OBD_IOC_BRW_READ:
+ rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti);
+ goto out;
+
+ case ECHO_IOC_GET_STRIPE:
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc == 0) {
+ rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1,
+ data->ioc_plen1);
+ echo_put_object(eco);
+ }
+ goto out;
+
+ case ECHO_IOC_SET_STRIPE:
+ if (!capable(CFS_CAP_SYS_ADMIN)) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ if (data->ioc_pbuf1 == NULL) { /* unset */
+ rc = echo_get_object(&eco, ed, oa);
+ if (rc == 0) {
+ eco->eo_deleted = 1;
+ echo_put_object(eco);
+ }
+ } else {
+ rc = echo_create_object(env, ed, 0, oa,
+ data->ioc_pbuf1,
+ data->ioc_plen1, &dummy_oti);
+ }
+ goto out;
+
+ case ECHO_IOC_ENQUEUE:
+ if (!capable(CFS_CAP_SYS_ADMIN)) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ rc = echo_client_enqueue(exp, oa,
+ data->ioc_conn1, /* lock mode */
+ data->ioc_offset,
+ data->ioc_count);/*extent*/
+ goto out;
+
+ case ECHO_IOC_CANCEL:
+ rc = echo_client_cancel(exp, oa);
+ goto out;
+
+ default:
+ CERROR("echo_ioctl(): unrecognised ioctl %#x\n", cmd);
+ rc = -ENOTTY;
+ goto out;
+ }
+
+out:
+ lu_env_fini(env);
+ OBD_FREE_PTR(env);
+
+ /* XXX this should be in a helper also called by target_send_reply */
+ for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4;
+ i++, ack_lock++) {
+ if (!ack_lock->mode)
+ break;
+ ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+ }
+
+ return rc;
+}
+
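+/*
+ * Connect the echo client to the target obd named in the config, using
+ * the echo connect flags and the FID_SEQ_ECHO group, and remove the new
+ * export from the pinger list since it connects to the target directly.
+ */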
+static int echo_client_setup(const struct lu_env *env,
+ struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+ struct echo_client_obd *ec = &obddev->u.echo_client;
+ struct obd_device *tgt;
+ struct obd_uuid echo_uuid = { "ECHO_UUID" };
+ struct obd_connect_data *ocd = NULL;
+ int rc;
+
+ if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+ CERROR("requires a TARGET OBD name\n");
+ return -EINVAL;
+ }
+
+ tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+ if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+ CERROR("device not attached or not set up (%s)\n",
+ lustre_cfg_string(lcfg, 1));
+ return -EINVAL;
+ }
+
+ spin_lock_init(&ec->ec_lock);
+ INIT_LIST_HEAD(&ec->ec_objects);
+ INIT_LIST_HEAD(&ec->ec_locks);
+ ec->ec_unique = 0;
+ ec->ec_nstripes = 0;
+
+ OBD_ALLOC(ocd, sizeof(*ocd));
+ if (ocd == NULL) {
+ CERROR("Can't alloc ocd connecting to %s\n",
+ lustre_cfg_string(lcfg, 1));
+ return -ENOMEM;
+ }
+
+ ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL |
+ OBD_CONNECT_BRW_SIZE |
+ OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 |
+ OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE |
+ OBD_CONNECT_FID;
+ ocd->ocd_brw_size = DT_MAX_BRW_SIZE;
+ ocd->ocd_version = LUSTRE_VERSION_CODE;
+ ocd->ocd_group = FID_SEQ_ECHO;
+
+ rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
+ if (rc == 0) {
+ /* Turn off pinger because it connects to tgt obd directly. */
+ spin_lock(&tgt->obd_dev_lock);
+ list_del_init(&ec->ec_exp->exp_obd_chain_timed);
+ spin_unlock(&tgt->obd_dev_lock);
+ }
+
+ OBD_FREE(ocd, sizeof(*ocd));
+
+ if (rc != 0) {
+ CERROR("fail to connect to device %s\n",
+ lustre_cfg_string(lcfg, 1));
+ return rc;
+ }
+
+ return rc;
+}
+
+static int echo_client_cleanup(struct obd_device *obddev)
+{
+ struct echo_client_obd *ec = &obddev->u.echo_client;
+ int rc;
+
+ if (!list_empty(&obddev->obd_exports)) {
+ CERROR("still has clients!\n");
+ return -EBUSY;
+ }
+
+ LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0);
+ rc = obd_disconnect(ec->ec_exp);
+ if (rc != 0)
+ CERROR("fail to disconnect device: %d\n", rc);
+
+ return rc;
+}
+
+static int echo_client_connect(const struct lu_env *env,
+ struct obd_export **exp,
+ struct obd_device *src, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata)
+{
+ int rc;
+ struct lustre_handle conn = { 0 };
+
+ rc = class_connect(&conn, src, cluuid);
+ if (rc == 0)
+ *exp = class_conn2export(&conn);
+
+ return rc;
+}
+
+static int echo_client_disconnect(struct obd_export *exp)
+{
+ if (exp == NULL)
+ return -EINVAL;
+
+ return class_disconnect(exp);
+}
+
+static struct obd_ops echo_client_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_iocontrol = echo_client_iocontrol,
+ .o_connect = echo_client_connect,
+ .o_disconnect = echo_client_disconnect
+};
+
+int echo_client_init(void)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+ int rc;
+
+ lprocfs_echo_init_vars(&lvars);
+
+ rc = lu_kmem_init(echo_caches);
+ if (rc == 0) {
+ rc = class_register_type(&echo_client_obd_ops, NULL,
+ lvars.module_vars,
+ LUSTRE_ECHO_CLIENT_NAME,
+ &echo_device_type);
+ if (rc)
+ lu_kmem_fini(echo_caches);
+ }
+ return rc;
+}
+
+void echo_client_exit(void)
+{
+ class_unregister_type(LUSTRE_ECHO_CLIENT_NAME);
+ lu_kmem_fini(echo_caches);
+}
+
+static int __init obdecho_init(void)
+{
+ struct lprocfs_static_vars lvars;
+
+ LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n");
+
+ LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
+
+ lprocfs_echo_init_vars(&lvars);
+
+ return echo_client_init();
+}
+
+static void /*__exit*/ obdecho_exit(void)
+{
+ echo_client_exit();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Testing Echo OBD driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+
+module_init(obdecho_init);
+module_exit(obdecho_exit);
+
+/** @} echo_client */
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
new file mode 100644
index 000000000..8e9dbc235
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo_internal.h
+ */
+
+#ifndef _ECHO_INTERNAL_H
+#define _ECHO_INTERNAL_H
+
+/* The persistent object (i.e. actually stores stuff!) */
+#define ECHO_PERSISTENT_OBJID 1ULL
+#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20))
+
+/* block size to use for data verification */
+#define OBD_ECHO_BLOCK_SIZE (4<<10)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
new file mode 100644
index 000000000..0beb97db7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include "../include/lprocfs_status.h"
+#include "../include/obd_class.h"
+
+#if defined(CONFIG_PROC_FS)
+LPROC_SEQ_FOPS_RO_TYPE(echo, uuid);
+static struct lprocfs_vars lprocfs_echo_obd_vars[] = {
+ { "uuid", &echo_uuid_fops, NULL, 0 },
+ { NULL }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(echo, numrefs);
+static struct lprocfs_vars lprocfs_echo_module_vars[] = {
+ { "num_refs", &echo_numrefs_fops, NULL, 0 },
+ { NULL }
+};
+
+void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = lprocfs_echo_module_vars;
+ lvars->obd_vars = lprocfs_echo_obd_vars;
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile
new file mode 100644
index 000000000..54927fba4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_LUSTRE_FS) += osc.o
+osc-y := osc_request.o osc_dev.o osc_object.o \
+ osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o
+osc-$(CONFIG_PROC_FS) += lproc_osc.o
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
new file mode 100644
index 000000000..15a662098
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c
@@ -0,0 +1,751 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/statfs.h>
+#include "../include/obd_cksum.h"
+#include "../include/obd_class.h"
+#include "../include/lprocfs_status.h"
+#include <linux/seq_file.h>
+#include "osc_internal.h"
+
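+/*
+ * "active" proc file: reading reports whether the import is active
+ * (i.e. not deactivated); writing 0 or 1 changes the import state via
+ * ptlrpc_set_import_active() when the value actually differs.
+ */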
+static int osc_active_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+
+ LPROCFS_CLIMP_CHECK(dev);
+ seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive);
+ LPROCFS_CLIMP_EXIT(dev);
+
+ return 0;
+}
+
+static ssize_t osc_active_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+ if (val < 0 || val > 1)
+ return -ERANGE;
+
+ /* opposite senses: imp_deactive is set when the import is not active */
+ if (dev->u.cli.cl_import->imp_deactive == val)
+ rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val);
+ else
+ CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val);
+
+ return count;
+}
+LPROC_SEQ_FOPS(osc_active);
+
+static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+
+static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &dev->u.cli;
+ struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val < 1 || val > OSC_MAX_RIF_MAX)
+ return -ERANGE;
+
+ LPROCFS_CLIMP_CHECK(dev);
+ if (pool && val > cli->cl_max_rpcs_in_flight)
+ pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_max_rpcs_in_flight = val;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ LPROCFS_CLIMP_EXIT(dev);
+ return count;
+}
+LPROC_SEQ_FOPS(osc_max_rpcs_in_flight);
+
+static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+ long val;
+ int mult;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ val = cli->cl_dirty_max;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ mult = 1 << 20;
+ return lprocfs_seq_read_frac_helper(m, val, mult);
+}
+
+static ssize_t osc_max_dirty_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &dev->u.cli;
+ int pages_number, mult, rc;
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number <= 0 ||
+ pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) ||
+ pages_number > totalram_pages / 4) /* 1/4 of RAM */
+ return -ERANGE;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_dirty_max = (u32)(pages_number << PAGE_CACHE_SHIFT);
+ osc_wake_cache_waiters(cli);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(osc_max_dirty_mb);
+
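+/*
+ * "cached_mb" proc file: reports the LRU page cache usage in MB and the
+ * busy page count; writing a smaller "used_mb:" value shrinks the LRU
+ * accordingly.
+ */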
+static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+ int shift = 20 - PAGE_CACHE_SHIFT;
+
+ seq_printf(m,
+ "used_mb: %d\n"
+ "busy_cnt: %d\n",
+ (atomic_read(&cli->cl_lru_in_list) +
+ atomic_read(&cli->cl_lru_busy)) >> shift,
+ atomic_read(&cli->cl_lru_busy));
+
+ return 0;
+}
+
+/* shrink the number of cached pages to a specific number */
+static ssize_t osc_cached_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &dev->u.cli;
+ int pages_number, mult, rc;
+ char kernbuf[128];
+
+ if (count >= sizeof(kernbuf))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+ kernbuf[count] = 0;
+
+ mult = 1 << (20 - PAGE_CACHE_SHIFT);
+ buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) -
+ kernbuf;
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number < 0)
+ return -ERANGE;
+
+ rc = atomic_read(&cli->cl_lru_in_list) - pages_number;
+ if (rc > 0)
+ (void)osc_lru_shrink(cli, rc);
+
+ return count;
+}
+LPROC_SEQ_FOPS(osc_cached_mb);
+
+static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ seq_printf(m, "%lu\n", cli->cl_dirty);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes);
+
+static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ seq_printf(m, "%lu\n", cli->cl_avail_grant);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+
+static ssize_t osc_cur_grant_bytes_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &obd->u.cli;
+ int rc;
+ __u64 val;
+
+ if (obd == NULL)
+ return 0;
+
+ rc = lprocfs_write_u64_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ /* this is only for shrinking grant */
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ if (val >= cli->cl_avail_grant) {
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ return 0;
+ }
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ LPROCFS_CLIMP_CHECK(obd);
+ if (cli->cl_import->imp_state == LUSTRE_IMP_FULL)
+ rc = osc_shrink_grant_to_target(cli, val);
+ LPROCFS_CLIMP_EXIT(obd);
+ if (rc)
+ return rc;
+ return count;
+}
+LPROC_SEQ_FOPS(osc_cur_grant_bytes);
+
+static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *dev = m->private;
+ struct client_obd *cli = &dev->u.cli;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ seq_printf(m, "%lu\n", cli->cl_lost_grant);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes);
+
+static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+
+ if (obd == NULL)
+ return 0;
+ seq_printf(m, "%d\n", obd->u.cli.cl_grant_shrink_interval);
+ return 0;
+}
+
+static ssize_t osc_grant_shrink_interval_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ int val, rc;
+
+ if (obd == NULL)
+ return 0;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= 0)
+ return -ERANGE;
+
+ obd->u.cli.cl_grant_shrink_interval = val;
+
+ return count;
+}
+LPROC_SEQ_FOPS(osc_grant_shrink_interval);
+
+static int osc_checksum_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+
+ if (obd == NULL)
+ return 0;
+
+ seq_printf(m, "%d\n", obd->u.cli.cl_checksum ? 1 : 0);
+ return 0;
+}
+
+static ssize_t osc_checksum_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ int val, rc;
+
+ if (obd == NULL)
+ return 0;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ obd->u.cli.cl_checksum = (val ? 1 : 0);
+
+ return count;
+}
+LPROC_SEQ_FOPS(osc_checksum);
+
+static int osc_checksum_type_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+ int i;
+ DECLARE_CKSUM_NAME;
+
+ if (obd == NULL)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
+ if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+ continue;
+ if (obd->u.cli.cl_cksum_type == (1 << i))
+ seq_printf(m, "[%s] ", cksum_name[i]);
+ else
+ seq_printf(m, "%s ", cksum_name[i]);
+ }
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static ssize_t osc_checksum_type_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ int i;
+ DECLARE_CKSUM_NAME;
+ char kernbuf[10];
+
+ if (obd == NULL)
+ return 0;
+
+ if (count > sizeof(kernbuf) - 1)
+ return -EINVAL;
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+ if (count > 0 && kernbuf[count - 1] == '\n')
+ kernbuf[count - 1] = '\0';
+ else
+ kernbuf[count] = '\0';
+
+ for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
+ if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+ continue;
+ if (!strcmp(kernbuf, cksum_name[i])) {
+ obd->u.cli.cl_cksum_type = 1 << i;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+LPROC_SEQ_FOPS(osc_checksum_type);
+
+static int osc_resend_count_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+
+ seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends));
+ return 0;
+}
+
+static ssize_t osc_resend_count_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val < 0)
+ return -EINVAL;
+
+ atomic_set(&obd->u.cli.cl_resends, val);
+
+ return count;
+}
+LPROC_SEQ_FOPS(osc_resend_count);
+
+static int osc_contention_seconds_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+ struct osc_device *od = obd2osc_dev(obd);
+
+ seq_printf(m, "%u\n", od->od_contention_time);
+ return 0;
+}
+
+static ssize_t osc_contention_seconds_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct osc_device *od = obd2osc_dev(obd);
+
+ return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?:
+ count;
+}
+LPROC_SEQ_FOPS(osc_contention_seconds);
+
+static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+ struct osc_device *od = obd2osc_dev(obd);
+
+ seq_printf(m, "%u\n", od->od_lockless_truncate);
+ return 0;
+}
+
+static ssize_t osc_lockless_truncate_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct osc_device *od = obd2osc_dev(obd);
+
+ return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?:
+ count;
+}
+LPROC_SEQ_FOPS(osc_lockless_truncate);
+
+static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+
+ seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_destroy_in_flight));
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(osc_destroys_in_flight);
+
+static int osc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v)
+{
+ return lprocfs_obd_rd_max_pages_per_rpc(m, m->private);
+}
+
+static ssize_t osc_obd_max_pages_per_rpc_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &dev->u.cli;
+ struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data;
+ int chunk_mask, rc;
+ __u64 val;
+
+ rc = lprocfs_write_u64_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ /* if max_pages is specified in bytes, convert it to pages */
+ if (val >= ONE_MB_BRW_SIZE)
+ val >>= PAGE_CACHE_SHIFT;
+
+ LPROCFS_CLIMP_CHECK(dev);
+
+ chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1);
+ /* max_pages_per_rpc must be chunk aligned */
+ val = (val + ~chunk_mask) & chunk_mask;
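+ /* e.g. (hypothetical values): with cl_chunkbits == 16 and 4KB pages,
+ * chunk_mask == ~0xf, so val is rounded up to a multiple of 16 pages */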
+ if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) {
+ LPROCFS_CLIMP_EXIT(dev);
+ return -ERANGE;
+ }
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_max_pages_per_rpc = val;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ LPROCFS_CLIMP_EXIT(dev);
+ return count;
+}
+LPROC_SEQ_FOPS(osc_obd_max_pages_per_rpc);
+
+LPROC_SEQ_FOPS_RO_TYPE(osc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(osc, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail);
+LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts);
+LPROC_SEQ_FOPS_RO_TYPE(osc, state);
+
+LPROC_SEQ_FOPS_WR_ONLY(osc, ping);
+
+LPROC_SEQ_FOPS_RW_TYPE(osc, import);
+LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov);
+
+static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
+ { "uuid", &osc_uuid_fops, NULL, 0 },
+ { "ping", &osc_ping_fops, NULL, 0222 },
+ { "connect_flags", &osc_connect_flags_fops, NULL, 0 },
+ { "blocksize", &osc_blksize_fops, NULL, 0 },
+ { "kbytestotal", &osc_kbytestotal_fops, NULL, 0 },
+ { "kbytesfree", &osc_kbytesfree_fops, NULL, 0 },
+ { "kbytesavail", &osc_kbytesavail_fops, NULL, 0 },
+ { "filestotal", &osc_filestotal_fops, NULL, 0 },
+ { "filesfree", &osc_filesfree_fops, NULL, 0 },
+ /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/
+ { "ost_server_uuid", &osc_server_uuid_fops, NULL, 0 },
+ { "ost_conn_uuid", &osc_conn_uuid_fops, NULL, 0 },
+ { "active", &osc_active_fops, NULL },
+ { "max_pages_per_rpc", &osc_obd_max_pages_per_rpc_fops, NULL },
+ { "max_rpcs_in_flight", &osc_max_rpcs_in_flight_fops, NULL },
+ { "destroys_in_flight", &osc_destroys_in_flight_fops, NULL, 0 },
+ { "max_dirty_mb", &osc_max_dirty_mb_fops, NULL },
+ { "osc_cached_mb", &osc_cached_mb_fops, NULL },
+ { "cur_dirty_bytes", &osc_cur_dirty_bytes_fops, NULL, 0 },
+ { "cur_grant_bytes", &osc_cur_grant_bytes_fops, NULL },
+ { "cur_lost_grant_bytes", &osc_cur_lost_grant_bytes_fops, NULL, 0},
+ { "grant_shrink_interval", &osc_grant_shrink_interval_fops, NULL },
+ { "checksums", &osc_checksum_fops, NULL },
+ { "checksum_type", &osc_checksum_type_fops, NULL },
+ { "resend_count", &osc_resend_count_fops, NULL},
+ { "timeouts", &osc_timeouts_fops, NULL, 0 },
+ { "contention_seconds", &osc_contention_seconds_fops, NULL },
+ { "lockless_truncate", &osc_lockless_truncate_fops, NULL },
+ { "import", &osc_import_fops, NULL },
+ { "state", &osc_state_fops, NULL, 0 },
+ { "pinger_recov", &osc_pinger_recov_fops, NULL },
+ { NULL }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(osc, numrefs);
+static struct lprocfs_vars lprocfs_osc_module_vars[] = {
+ { "num_refs", &osc_numrefs_fops, NULL, 0 },
+ { NULL }
+};
+
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
+static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
+{
+ struct timeval now;
+ struct obd_device *dev = seq->private;
+ struct client_obd *cli = &dev->u.cli;
+ unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+ int i;
+
+ do_gettimeofday(&now);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+
+ seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n",
+ now.tv_sec, (unsigned long)now.tv_usec);
+ seq_printf(seq, "read RPCs in flight: %d\n",
+ cli->cl_r_in_flight);
+ seq_printf(seq, "write RPCs in flight: %d\n",
+ cli->cl_w_in_flight);
+ seq_printf(seq, "pending write pages: %d\n",
+ atomic_read(&cli->cl_pending_w_pages));
+ seq_printf(seq, "pending read pages: %d\n",
+ atomic_read(&cli->cl_pending_r_pages));
+
+ seq_puts(seq, "\n\t\t\tread\t\t\twrite\n");
+ seq_puts(seq, "pages per rpc rpcs % cum % |");
+ seq_puts(seq, " rpcs % cum %\n");
+
+ read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist);
+ write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist);
+
+ read_cum = 0;
+ write_cum = 0;
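+ /* each row below is labelled with 1 << i pages; the %/cum% columns
+ * are relative to the read/write totals computed above */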
+ for (i = 0; i < OBD_HIST_MAX; i++) {
+ unsigned long r = cli->cl_read_page_hist.oh_buckets[i];
+ unsigned long w = cli->cl_write_page_hist.oh_buckets[i];
+ read_cum += r;
+ write_cum += w;
+ seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n",
+ 1 << i, r, pct(r, read_tot),
+ pct(read_cum, read_tot), w,
+ pct(w, write_tot),
+ pct(write_cum, write_tot));
+ if (read_cum == read_tot && write_cum == write_tot)
+ break;
+ }
+
+ seq_puts(seq, "\n\t\t\tread\t\t\twrite\n");
+ seq_puts(seq, "rpcs in flight rpcs % cum % |");
+ seq_puts(seq, " rpcs % cum %\n");
+
+ read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist);
+ write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist);
+
+ read_cum = 0;
+ write_cum = 0;
+ for (i = 0; i < OBD_HIST_MAX; i++) {
+ unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i];
+ unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i];
+ read_cum += r;
+ write_cum += w;
+ seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n",
+ i, r, pct(r, read_tot),
+ pct(read_cum, read_tot), w,
+ pct(w, write_tot),
+ pct(write_cum, write_tot));
+ if (read_cum == read_tot && write_cum == write_tot)
+ break;
+ }
+
+ seq_puts(seq, "\n\t\t\tread\t\t\twrite\n");
+ seq_puts(seq, "offset rpcs % cum % |");
+ seq_puts(seq, " rpcs % cum %\n");
+
+ read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist);
+ write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist);
+
+ read_cum = 0;
+ write_cum = 0;
+ for (i = 0; i < OBD_HIST_MAX; i++) {
+ unsigned long r = cli->cl_read_offset_hist.oh_buckets[i];
+ unsigned long w = cli->cl_write_offset_hist.oh_buckets[i];
+ read_cum += r;
+ write_cum += w;
+ seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n",
+ (i == 0) ? 0 : 1 << (i - 1),
+ r, pct(r, read_tot), pct(read_cum, read_tot),
+ w, pct(w, write_tot), pct(write_cum, write_tot));
+ if (read_cum == read_tot && write_cum == write_tot)
+ break;
+ }
+
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return 0;
+}
+#undef pct
+
+static ssize_t osc_rpc_stats_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct obd_device *dev = seq->private;
+ struct client_obd *cli = &dev->u.cli;
+
+ lprocfs_oh_clear(&cli->cl_read_rpc_hist);
+ lprocfs_oh_clear(&cli->cl_write_rpc_hist);
+ lprocfs_oh_clear(&cli->cl_read_page_hist);
+ lprocfs_oh_clear(&cli->cl_write_page_hist);
+ lprocfs_oh_clear(&cli->cl_read_offset_hist);
+ lprocfs_oh_clear(&cli->cl_write_offset_hist);
+
+ return len;
+}
+
+LPROC_SEQ_FOPS(osc_rpc_stats);
+
+static int osc_stats_seq_show(struct seq_file *seq, void *v)
+{
+ struct timeval now;
+ struct obd_device *dev = seq->private;
+ struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+ do_gettimeofday(&now);
+
+ seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n",
+ now.tv_sec, (unsigned long)now.tv_usec);
+ seq_printf(seq, "lockless_write_bytes\t\t%llu\n",
+ stats->os_lockless_writes);
+ seq_printf(seq, "lockless_read_bytes\t\t%llu\n",
+ stats->os_lockless_reads);
+ seq_printf(seq, "lockless_truncate\t\t%llu\n",
+ stats->os_lockless_truncates);
+ return 0;
+}
+
+static ssize_t osc_stats_seq_write(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *off)
+{
+ struct seq_file *seq = file->private_data;
+ struct obd_device *dev = seq->private;
+ struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+ memset(stats, 0, sizeof(*stats));
+ return len;
+}
+
+LPROC_SEQ_FOPS(osc_stats);
+
+int lproc_osc_attach_seqstat(struct obd_device *dev)
+{
+ int rc;
+
+ rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644,
+ &osc_stats_fops, dev);
+ if (rc == 0)
+ rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644,
+ &osc_rpc_stats_fops, dev);
+
+ return rc;
+}
+
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+ lvars->module_vars = lprocfs_osc_module_vars;
+ lvars->obd_vars = lprocfs_osc_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
new file mode 100644
index 000000000..d44b3d4ff
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -0,0 +1,2944 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osc cache management.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+#include "osc_internal.h"
+
+static int extent_debug; /* set to non-zero for more debugging output */
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+ int state);
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+ struct osc_async_page *oap, int sent, int rc);
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+ int cmd);
+static int osc_refresh_count(const struct lu_env *env,
+ struct osc_async_page *oap, int cmd);
+static int osc_io_unplug_async(const struct lu_env *env,
+ struct client_obd *cli, struct osc_object *osc);
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+ unsigned int lost_grant);
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+ const char *func, int line);
+#define osc_extent_tree_dump(lvl, obj) \
+ osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
+
+/** \addtogroup osc
+ * @{
+ */
+
+/* ------------------ osc extent ------------------ */
+static inline char *ext_flags(struct osc_extent *ext, char *flags)
+{
+ char *buf = flags;
+ *buf++ = ext->oe_rw ? 'r' : 'w';
+ if (ext->oe_intree)
+ *buf++ = 'i';
+ if (ext->oe_srvlock)
+ *buf++ = 's';
+ if (ext->oe_hp)
+ *buf++ = 'h';
+ if (ext->oe_urgent)
+ *buf++ = 'u';
+ if (ext->oe_memalloc)
+ *buf++ = 'm';
+ if (ext->oe_trunc_pending)
+ *buf++ = 't';
+ if (ext->oe_fsync_wait)
+ *buf++ = 'Y';
+ *buf = 0;
+ return flags;
+}
+
+static inline char list_empty_marker(struct list_head *list)
+{
+ return list_empty(list) ? '-' : '+';
+}
+
+#define EXTSTR "[%lu -> %lu/%lu]"
+#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end
+static const char *oes_strings[] = {
+ "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL };
+
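+/*
+ * Dump an extent in a compact one-line form. The first bracket prints
+ * refc|users|link|state|flags|obj and the second prints
+ * grants|nr_pages|pages|waitq|osclock|mppr|owner, matching the argument
+ * order below.
+ */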
+#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \
+ struct osc_extent *__ext = (extent); \
+ char __buf[16]; \
+ \
+ CDEBUG(lvl, \
+ "extent %p@{" EXTSTR ", " \
+ "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \
+ /* ----- extent part 0 ----- */ \
+ __ext, EXTPARA(__ext), \
+ /* ----- part 1 ----- */ \
+ atomic_read(&__ext->oe_refc), \
+ atomic_read(&__ext->oe_users), \
+ list_empty_marker(&__ext->oe_link), \
+ oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \
+ __ext->oe_obj, \
+ /* ----- part 2 ----- */ \
+ __ext->oe_grants, __ext->oe_nr_pages, \
+ list_empty_marker(&__ext->oe_pages), \
+ waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \
+ __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner, \
+ /* ----- part 4 ----- */ \
+ ## __VA_ARGS__); \
+} while (0)
+
+#undef EASSERTF
+#define EASSERTF(expr, ext, fmt, args...) do { \
+ if (!(expr)) { \
+ OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \
+ osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \
+ LASSERT(expr); \
+ } \
+} while (0)
+
+#undef EASSERT
+#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
+
+static inline struct osc_extent *rb_extent(struct rb_node *n)
+{
+ if (n == NULL)
+ return NULL;
+
+ return container_of(n, struct osc_extent, oe_node);
+}
+
+static inline struct osc_extent *next_extent(struct osc_extent *ext)
+{
+ if (ext == NULL)
+ return NULL;
+
+ LASSERT(ext->oe_intree);
+ return rb_extent(rb_next(&ext->oe_node));
+}
+
+static inline struct osc_extent *prev_extent(struct osc_extent *ext)
+{
+ if (ext == NULL)
+ return NULL;
+
+ LASSERT(ext->oe_intree);
+ return rb_extent(rb_prev(&ext->oe_node));
+}
+
+static inline struct osc_extent *first_extent(struct osc_object *obj)
+{
+ return rb_extent(rb_first(&obj->oo_root));
+}
+
+/* object must be locked by caller. */
+static int osc_extent_sanity_check0(struct osc_extent *ext,
+ const char *func, const int line)
+{
+ struct osc_object *obj = ext->oe_obj;
+ struct osc_async_page *oap;
+ int page_count;
+ int rc = 0;
+
+ if (!osc_object_is_locked(obj)) {
+ rc = 9;
+ goto out;
+ }
+
+ if (ext->oe_state >= OES_STATE_MAX) {
+ rc = 10;
+ goto out;
+ }
+
+ if (atomic_read(&ext->oe_refc) <= 0) {
+ rc = 20;
+ goto out;
+ }
+
+ if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) {
+ rc = 30;
+ goto out;
+ }
+
+ switch (ext->oe_state) {
+ case OES_INV:
+ if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
+ rc = 35;
+ else
+ rc = 0;
+ goto out;
+ case OES_ACTIVE:
+ if (atomic_read(&ext->oe_users) == 0) {
+ rc = 40;
+ goto out;
+ }
+ if (ext->oe_hp) {
+ rc = 50;
+ goto out;
+ }
+ if (ext->oe_fsync_wait && !ext->oe_urgent) {
+ rc = 55;
+ goto out;
+ }
+ break;
+ case OES_CACHE:
+ if (ext->oe_grants == 0) {
+ rc = 60;
+ goto out;
+ }
+ if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) {
+ rc = 65;
+ goto out;
+ }
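+ /* fall through */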
+ default:
+ if (atomic_read(&ext->oe_users) > 0) {
+ rc = 70;
+ goto out;
+ }
+ }
+
+ if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) {
+ rc = 80;
+ goto out;
+ }
+
+ if (ext->oe_osclock == NULL && ext->oe_grants > 0) {
+ rc = 90;
+ goto out;
+ }
+
+ if (ext->oe_osclock) {
+ struct cl_lock_descr *descr;
+ descr = &ext->oe_osclock->cll_descr;
+ if (!(descr->cld_start <= ext->oe_start &&
+ descr->cld_end >= ext->oe_max_end)) {
+ rc = 100;
+ goto out;
+ }
+ }
+
+ if (ext->oe_nr_pages > ext->oe_mppr) {
+ rc = 105;
+ goto out;
+ }
+
+ /* Do not verify page list if extent is in RPC. This is because an
+ * in-RPC extent is supposed to be exclusively accessible w/o lock. */
+ if (ext->oe_state > OES_CACHE) {
+ rc = 0;
+ goto out;
+ }
+
+ if (!extent_debug) {
+ rc = 0;
+ goto out;
+ }
+
+ page_count = 0;
+ list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+ pgoff_t index = oap2cl_page(oap)->cp_index;
+ ++page_count;
+ if (index > ext->oe_end || index < ext->oe_start) {
+ rc = 110;
+ goto out;
+ }
+ }
+ if (page_count != ext->oe_nr_pages) {
+ rc = 120;
+ goto out;
+ }
+
+out:
+ if (rc != 0)
+ OSC_EXTENT_DUMP(D_ERROR, ext,
+ "%s:%d sanity check %p failed with rc = %d\n",
+ func, line, ext, rc);
+ return rc;
+}
+
+#define sanity_check_nolock(ext) \
+ osc_extent_sanity_check0(ext, __func__, __LINE__)
+
+#define sanity_check(ext) ({ \
+ int __res; \
+ osc_object_lock((ext)->oe_obj); \
+ __res = sanity_check_nolock(ext); \
+ osc_object_unlock((ext)->oe_obj); \
+ __res; \
+})
+
+
+/**
+ * sanity check - to make sure there is no overlapped extent in the tree.
+ */
+static int osc_extent_is_overlapped(struct osc_object *obj,
+ struct osc_extent *ext)
+{
+ struct osc_extent *tmp;
+
+ LASSERT(osc_object_is_locked(obj));
+
+ if (!extent_debug)
+ return 0;
+
+ for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) {
+ if (tmp == ext)
+ continue;
+ if (tmp->oe_end >= ext->oe_start &&
+ tmp->oe_start <= ext->oe_end)
+ return 1;
+ }
+ return 0;
+}
+
+static void osc_extent_state_set(struct osc_extent *ext, int state)
+{
+ LASSERT(osc_object_is_locked(ext->oe_obj));
+ LASSERT(state >= OES_INV && state < OES_STATE_MAX);
+
+ /* Never try to sanity check a state changing extent :-) */
+ /* LASSERT(sanity_check_nolock(ext) == 0); */
+
+ /* TODO: validate the state machine */
+ ext->oe_state = state;
+ wake_up_all(&ext->oe_waitq);
+}
+
+static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
+{
+ struct osc_extent *ext;
+
+ OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS);
+ if (ext == NULL)
+ return NULL;
+
+ RB_CLEAR_NODE(&ext->oe_node);
+ ext->oe_obj = obj;
+ atomic_set(&ext->oe_refc, 1);
+ atomic_set(&ext->oe_users, 0);
+ INIT_LIST_HEAD(&ext->oe_link);
+ ext->oe_state = OES_INV;
+ INIT_LIST_HEAD(&ext->oe_pages);
+ init_waitqueue_head(&ext->oe_waitq);
+ ext->oe_osclock = NULL;
+
+ return ext;
+}
+
+static void osc_extent_free(struct osc_extent *ext)
+{
+ OBD_SLAB_FREE_PTR(ext, osc_extent_kmem);
+}
+
+static struct osc_extent *osc_extent_get(struct osc_extent *ext)
+{
+ LASSERT(atomic_read(&ext->oe_refc) >= 0);
+ atomic_inc(&ext->oe_refc);
+ return ext;
+}
+
+static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
+{
+ LASSERT(atomic_read(&ext->oe_refc) > 0);
+ if (atomic_dec_and_test(&ext->oe_refc)) {
+ LASSERT(list_empty(&ext->oe_link));
+ LASSERT(atomic_read(&ext->oe_users) == 0);
+ LASSERT(ext->oe_state == OES_INV);
+ LASSERT(!ext->oe_intree);
+
+ if (ext->oe_osclock) {
+ cl_lock_put(env, ext->oe_osclock);
+ ext->oe_osclock = NULL;
+ }
+ osc_extent_free(ext);
+ }
+}
+
+/**
+ * osc_extent_put_trust() is a special version of osc_extent_put() for use
+ * when it's known that the caller is not the last user. This is to address
+ * the problem of not having a lu_env available ;-).
+ */
+static void osc_extent_put_trust(struct osc_extent *ext)
+{
+ LASSERT(atomic_read(&ext->oe_refc) > 1);
+ LASSERT(osc_object_is_locked(ext->oe_obj));
+ atomic_dec(&ext->oe_refc);
+}
+
+/**
+ * Return the extent which includes pgoff @index, or return the greatest
+ * previous extent in the tree.
+ */
+static struct osc_extent *osc_extent_search(struct osc_object *obj,
+ pgoff_t index)
+{
+ struct rb_node *n = obj->oo_root.rb_node;
+ struct osc_extent *tmp, *p = NULL;
+
+ LASSERT(osc_object_is_locked(obj));
+ while (n != NULL) {
+ tmp = rb_extent(n);
+ if (index < tmp->oe_start) {
+ n = n->rb_left;
+ } else if (index > tmp->oe_end) {
+ p = rb_extent(n);
+ n = n->rb_right;
+ } else {
+ return tmp;
+ }
+ }
+ return p;
+}
+
+/*
+ * Return the extent covering @index, otherwise return NULL.
+ * caller must have held object lock.
+ */
+static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
+ pgoff_t index)
+{
+ struct osc_extent *ext;
+
+ ext = osc_extent_search(obj, index);
+ if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end)
+ return osc_extent_get(ext);
+ return NULL;
+}
+
+/* caller must have held object lock. */
+static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
+{
+ struct rb_node **n = &obj->oo_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct osc_extent *tmp;
+
+ LASSERT(ext->oe_intree == 0);
+ LASSERT(ext->oe_obj == obj);
+ LASSERT(osc_object_is_locked(obj));
+ while (*n != NULL) {
+ tmp = rb_extent(*n);
+ parent = *n;
+
+ if (ext->oe_end < tmp->oe_start)
+ n = &(*n)->rb_left;
+ else if (ext->oe_start > tmp->oe_end)
+ n = &(*n)->rb_right;
+ else
+ EASSERTF(0, tmp, EXTSTR, EXTPARA(ext));
+ }
+ rb_link_node(&ext->oe_node, parent, n);
+ rb_insert_color(&ext->oe_node, &obj->oo_root);
+ osc_extent_get(ext);
+ ext->oe_intree = 1;
+}
+
+/* caller must have held object lock. */
+static void osc_extent_erase(struct osc_extent *ext)
+{
+ struct osc_object *obj = ext->oe_obj;
+ LASSERT(osc_object_is_locked(obj));
+ if (ext->oe_intree) {
+ rb_erase(&ext->oe_node, &obj->oo_root);
+ ext->oe_intree = 0;
+ /* rbtree held a refcount */
+ osc_extent_put_trust(ext);
+ }
+}
+
+static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
+{
+ struct osc_object *obj = ext->oe_obj;
+
+ LASSERT(osc_object_is_locked(obj));
+ LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
+ if (ext->oe_state == OES_CACHE) {
+ osc_extent_state_set(ext, OES_ACTIVE);
+ osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
+ }
+ atomic_inc(&ext->oe_users);
+ list_del_init(&ext->oe_link);
+ return osc_extent_get(ext);
+}
+
+static void __osc_extent_remove(struct osc_extent *ext)
+{
+ LASSERT(osc_object_is_locked(ext->oe_obj));
+ LASSERT(list_empty(&ext->oe_pages));
+ osc_extent_erase(ext);
+ list_del_init(&ext->oe_link);
+ osc_extent_state_set(ext, OES_INV);
+ OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
+}
+
+static void osc_extent_remove(struct osc_extent *ext)
+{
+ struct osc_object *obj = ext->oe_obj;
+
+ osc_object_lock(obj);
+ __osc_extent_remove(ext);
+ osc_object_unlock(obj);
+}
+
+/**
+ * This function is used to merge extents to get better performance. It checks
+ * if @cur and @victim are contiguous at chunk level.
+ */
+static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
+ struct osc_extent *victim)
+{
+ struct osc_object *obj = cur->oe_obj;
+ pgoff_t chunk_start;
+ pgoff_t chunk_end;
+ int ppc_bits;
+
+ LASSERT(cur->oe_state == OES_CACHE);
+ LASSERT(osc_object_is_locked(obj));
+ if (victim == NULL)
+ return -EINVAL;
+
+ if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
+ return -EBUSY;
+
+ if (cur->oe_max_end != victim->oe_max_end)
+ return -ERANGE;
+
+ LASSERT(cur->oe_osclock == victim->oe_osclock);
+ ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+ chunk_start = cur->oe_start >> ppc_bits;
+ chunk_end = cur->oe_end >> ppc_bits;
+ if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
+ chunk_end + 1 != victim->oe_start >> ppc_bits)
+ return -ERANGE;
+
+ OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);
+
+ cur->oe_start = min(cur->oe_start, victim->oe_start);
+ cur->oe_end = max(cur->oe_end, victim->oe_end);
+ cur->oe_grants += victim->oe_grants;
+ cur->oe_nr_pages += victim->oe_nr_pages;
+ /* only the following bits are needed to merge */
+ cur->oe_urgent |= victim->oe_urgent;
+ cur->oe_memalloc |= victim->oe_memalloc;
+ list_splice_init(&victim->oe_pages, &cur->oe_pages);
+ list_del_init(&victim->oe_link);
+ victim->oe_nr_pages = 0;
+
+ osc_extent_get(victim);
+ __osc_extent_remove(victim);
+ osc_extent_put(env, victim);
+
+ OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
+ return 0;
+}
+
+/**
+ * Drop user count of osc_extent, and unplug IO asynchronously.
+ */
+void osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+{
+ struct osc_object *obj = ext->oe_obj;
+
+ LASSERT(atomic_read(&ext->oe_users) > 0);
+ LASSERT(sanity_check(ext) == 0);
+ LASSERT(ext->oe_grants > 0);
+
+ if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
+ LASSERT(ext->oe_state == OES_ACTIVE);
+ if (ext->oe_trunc_pending) {
+ /* a truncate process is waiting for this extent.
+ * This may happen due to a race, check
+ * osc_cache_truncate_start(). */
+ osc_extent_state_set(ext, OES_TRUNC);
+ ext->oe_trunc_pending = 0;
+ } else {
+ osc_extent_state_set(ext, OES_CACHE);
+ osc_update_pending(obj, OBD_BRW_WRITE,
+ ext->oe_nr_pages);
+
+ /* try to merge the previous and next extent. */
+ osc_extent_merge(env, ext, prev_extent(ext));
+ osc_extent_merge(env, ext, next_extent(ext));
+
+ if (ext->oe_urgent)
+ list_move_tail(&ext->oe_link,
+ &obj->oo_urgent_exts);
+ }
+ osc_object_unlock(obj);
+
+ osc_io_unplug_async(env, osc_cli(obj), obj);
+ }
+ osc_extent_put(env, ext);
+}
+
+static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+{
+ return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
+}
+
+/**
+ * Find or create an extent which includes @index, core function to manage
+ * extent tree.
+ */
+struct osc_extent *osc_extent_find(const struct lu_env *env,
+ struct osc_object *obj, pgoff_t index,
+ int *grants)
+
+{
+ struct client_obd *cli = osc_cli(obj);
+ struct cl_lock *lock;
+ struct osc_extent *cur;
+ struct osc_extent *ext;
+ struct osc_extent *conflict = NULL;
+ struct osc_extent *found = NULL;
+ pgoff_t chunk;
+ pgoff_t max_end;
+ int max_pages; /* max_pages_per_rpc */
+ int chunksize;
+ int ppc_bits; /* pages per chunk bits */
+ int chunk_mask;
+ int rc;
+
+ cur = osc_extent_alloc(obj);
+ if (cur == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
+ LASSERT(lock != NULL);
+ LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+
+ LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
+ ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+ chunk_mask = ~((1 << ppc_bits) - 1);
+ chunksize = 1 << cli->cl_chunkbits;
+ chunk = index >> ppc_bits;
+
+ /* align end to the RPC edge; the RPC size may not be a power-of-2 integer. */
+ max_pages = cli->cl_max_pages_per_rpc;
+ LASSERT((max_pages & ~chunk_mask) == 0);
+ max_end = index - (index % max_pages) + max_pages - 1;
+ max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);
+
+ /* initialize new extent by parameters so far */
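+ /* for illustration (hypothetical values): with cl_chunkbits == 20 and
+ * 4KB pages, ppc_bits == 8 (256 pages per chunk); index == 300 gives
+ * oe_start == 256 and oe_end == 511 before the clipping below */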
+ cur->oe_max_end = max_end;
+ cur->oe_start = index & chunk_mask;
+ cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
+ if (cur->oe_start < lock->cll_descr.cld_start)
+ cur->oe_start = lock->cll_descr.cld_start;
+ if (cur->oe_end > max_end)
+ cur->oe_end = max_end;
+ cur->oe_osclock = lock;
+ cur->oe_grants = 0;
+ cur->oe_mppr = max_pages;
+
+ /* grants has been allocated by caller */
+ LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
+ "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
+ LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));
+
+restart:
+ osc_object_lock(obj);
+ ext = osc_extent_search(obj, cur->oe_start);
+ if (ext == NULL)
+ ext = first_extent(obj);
+ while (ext != NULL) {
+ loff_t ext_chk_start = ext->oe_start >> ppc_bits;
+ loff_t ext_chk_end = ext->oe_end >> ppc_bits;
+
+ LASSERT(sanity_check_nolock(ext) == 0);
+ if (chunk > ext_chk_end + 1)
+ break;
+
+ /* if covering by different locks, no chance to match */
+ if (lock != ext->oe_osclock) {
+ EASSERTF(!overlapped(ext, cur), ext,
+ EXTSTR, EXTPARA(cur));
+
+ ext = next_extent(ext);
+ continue;
+ }
+
+ /* discontiguous chunks? */
+ if (chunk + 1 < ext_chk_start) {
+ ext = next_extent(ext);
+ continue;
+ }
+
+ /* ok, from now on, ext and cur have these attrs:
+ * 1. covered by the same lock
+ * 2. contiguous at chunk level or overlapping. */
+
+ if (overlapped(ext, cur)) {
+ /* cur is the minimum unit, so overlapping means
+ * full containment. */
+ EASSERTF((ext->oe_start <= cur->oe_start &&
+ ext->oe_end >= cur->oe_end),
+ ext, EXTSTR, EXTPARA(cur));
+
+ if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
+ /* for simplicity, we wait for this extent to
+ * finish before going forward. */
+ conflict = osc_extent_get(ext);
+ break;
+ }
+
+ found = osc_extent_hold(ext);
+ break;
+ }
+
+ /* non-overlapped extent */
+ if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
+ /* we can't do anything with a non-OES_CACHE extent, or if
+ * someone is waiting for this extent to be flushed; try the
+ * next one. */
+ ext = next_extent(ext);
+ continue;
+ }
+
+ /* check if they belong to the same RPC slot before trying to
+ * merge. To get here the extents must be non-overlapping and
+ * contiguous at the chunk level. */
+ if (ext->oe_max_end != max_end) {
+ /* if they don't belong to the same RPC slot or
+ * max_pages_per_rpc has ever changed, do not merge. */
+ ext = next_extent(ext);
+ continue;
+ }
+
+ /* it's required that an extent must be contiguous at chunk
+ * level so that we know the whole extent is covered by grant
+ * (the pages in the extent are NOT required to be contiguous).
+ * Otherwise, it would be too difficult to know which chunks
+ * have grants allocated. */
+
+ /* try to do front merge - extend ext's start */
+ if (chunk + 1 == ext_chk_start) {
+ /* ext must be chunk size aligned */
+ EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
+
+ /* pull ext's start back to cover cur */
+ ext->oe_start = cur->oe_start;
+ ext->oe_grants += chunksize;
+ *grants -= chunksize;
+
+ found = osc_extent_hold(ext);
+ } else if (chunk == ext_chk_end + 1) {
+ /* rear merge */
+ ext->oe_end = cur->oe_end;
+ ext->oe_grants += chunksize;
+ *grants -= chunksize;
+
+ /* try to merge with the next one because we just filled
+ * in a gap */
+ if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
+ /* we can save extent tax from next extent */
+ *grants += cli->cl_extent_tax;
+
+ found = osc_extent_hold(ext);
+ }
+ if (found != NULL)
+ break;
+
+ ext = next_extent(ext);
+ }
+
+ osc_extent_tree_dump(D_CACHE, obj);
+ if (found != NULL) {
+ LASSERT(conflict == NULL);
+ if (!IS_ERR(found)) {
+ LASSERT(found->oe_osclock == cur->oe_osclock);
+ OSC_EXTENT_DUMP(D_CACHE, found,
+ "found caching ext for %lu.\n", index);
+ }
+ } else if (conflict == NULL) {
+ /* create a new extent */
+ EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
+ cur->oe_grants = chunksize + cli->cl_extent_tax;
+ *grants -= cur->oe_grants;
+ LASSERT(*grants >= 0);
+
+ cur->oe_state = OES_CACHE;
+ found = osc_extent_hold(cur);
+ osc_extent_insert(obj, cur);
+ OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
+ index, lock->cll_descr.cld_end);
+ }
+ osc_object_unlock(obj);
+
+ if (conflict != NULL) {
+ LASSERT(found == NULL);
+
+ /* waiting for IO to finish. Note that it cannot be an
+ * OES_TRUNC extent. */
+ rc = osc_extent_wait(env, conflict, OES_INV);
+ osc_extent_put(env, conflict);
+ conflict = NULL;
+ if (rc < 0) {
+ found = ERR_PTR(rc);
+ goto out;
+ }
+
+ goto restart;
+ }
+
+out:
+ osc_extent_put(env, cur);
+ LASSERT(*grants >= 0);
+ return found;
+}
+
+/**
+ * Called when IO is finished to an extent.
+ */
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+ int sent, int rc)
+{
+ struct client_obd *cli = osc_cli(ext->oe_obj);
+ struct osc_async_page *oap;
+ struct osc_async_page *tmp;
+ int nr_pages = ext->oe_nr_pages;
+ int lost_grant = 0;
+ int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
+ __u64 last_off = 0;
+ int last_count = -1;
+
+ OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
+
+ ext->oe_rc = rc ?: ext->oe_nr_pages;
+ EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
+ list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+ oap_pending_item) {
+ list_del_init(&oap->oap_rpc_item);
+ list_del_init(&oap->oap_pending_item);
+ if (last_off <= oap->oap_obj_off) {
+ last_off = oap->oap_obj_off;
+ last_count = oap->oap_count;
+ }
+
+ --ext->oe_nr_pages;
+ osc_ap_completion(env, cli, oap, sent, rc);
+ }
+ EASSERT(ext->oe_nr_pages == 0, ext);
+
+ if (!sent) {
+ lost_grant = ext->oe_grants;
+ } else if (blocksize < PAGE_CACHE_SIZE &&
+ last_count != PAGE_CACHE_SIZE) {
+ /* For short writes we shouldn't count parts of pages that
+ * span a whole chunk on the OST side, or our accounting goes
+ * wrong. Should match the code in filter_grant_check. */
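+ /* hypothetical example, assuming 64KB client pages and a 4KB OST
+ * blocksize: a 10000-byte write at page offset 0 is rounded up to
+ * 12288 bytes (3 blocks) on the OST, so lost_grant is
+ * 65536 - 12288 = 53248 bytes */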
+ int offset = oap->oap_page_off & ~CFS_PAGE_MASK;
+ int count = oap->oap_count + (offset & (blocksize - 1));
+ int end = (offset + oap->oap_count) & (blocksize - 1);
+ if (end)
+ count += blocksize - end;
+
+ lost_grant = PAGE_CACHE_SIZE - count;
+ }
+ if (ext->oe_grants > 0)
+ osc_free_grant(cli, nr_pages, lost_grant);
+
+ osc_extent_remove(ext);
+ /* put the refcount for RPC */
+ osc_extent_put(env, ext);
+ return 0;
+}
+
+static int extent_wait_cb(struct osc_extent *ext, int state)
+{
+ int ret;
+
+ osc_object_lock(ext->oe_obj);
+ ret = ext->oe_state == state;
+ osc_object_unlock(ext->oe_obj);
+
+ return ret;
+}
+
+/**
+ * Wait for the extent's state to become @state.
+ */
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+ int state)
+{
+ struct osc_object *obj = ext->oe_obj;
+ struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+ LWI_ON_SIGNAL_NOOP, NULL);
+ int rc = 0;
+
+ osc_object_lock(obj);
+ LASSERT(sanity_check_nolock(ext) == 0);
+ /* `Kick' this extent only if the caller is waiting for it to be
+ * written out. */
+ if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp &&
+ !ext->oe_trunc_pending) {
+ if (ext->oe_state == OES_ACTIVE) {
+ ext->oe_urgent = 1;
+ } else if (ext->oe_state == OES_CACHE) {
+ ext->oe_urgent = 1;
+ osc_extent_hold(ext);
+ rc = 1;
+ }
+ }
+ osc_object_unlock(obj);
+ if (rc == 1)
+ osc_extent_release(env, ext);
+
+ /* wait for the extent until its state becomes @state */
+ rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
+ if (rc == -ETIMEDOUT) {
+ OSC_EXTENT_DUMP(D_ERROR, ext,
+ "%s: wait ext to %d timedout, recovery in progress?\n",
+ osc_export(obj)->exp_obd->obd_name, state);
+
+ lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
+ &lwi);
+ }
+ if (rc == 0 && ext->oe_rc < 0)
+ rc = ext->oe_rc;
+ return rc;
+}
+
+/**
+ * Discard pages with index greater than @size. If @ext is overlapped with
+ * @size, then partial truncate happens.
+ */
+static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
+ bool partial)
+{
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct osc_object *obj = ext->oe_obj;
+ struct client_obd *cli = osc_cli(obj);
+ struct osc_async_page *oap;
+ struct osc_async_page *tmp;
+ int pages_in_chunk = 0;
+ int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+ __u64 trunc_chunk = trunc_index >> ppc_bits;
+ int grants = 0;
+ int nr_pages = 0;
+ int rc = 0;
+
+ LASSERT(sanity_check(ext) == 0);
+ EASSERT(ext->oe_state == OES_TRUNC, ext);
+ EASSERT(!ext->oe_urgent, ext);
+
+ /* Request new lu_env.
+ * We can't use that env from osc_cache_truncate_start() because
+ * it's from lov_io_sub and not fully initialized. */
+ env = cl_env_nested_get(&nest);
+ io = &osc_env_info(env)->oti_io;
+ io->ci_obj = cl_object_top(osc2cl(obj));
+ rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+ if (rc < 0)
+ goto out;
+
+ /* discard all pages with index greater than trunc_index */
+ list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+ oap_pending_item) {
+ struct cl_page *sub = oap2cl_page(oap);
+ struct cl_page *page = cl_page_top(sub);
+
+ LASSERT(list_empty(&oap->oap_rpc_item));
+
+ /* only discard the pages with their index greater than
+ * trunc_index, and ... */
+ if (sub->cp_index < trunc_index ||
+ (sub->cp_index == trunc_index && partial)) {
+ /* count how many pages remain in the chunk
+ * so that we can calculate grants correctly. */
+ if (sub->cp_index >> ppc_bits == trunc_chunk)
+ ++pages_in_chunk;
+ continue;
+ }
+
+ list_del_init(&oap->oap_pending_item);
+
+ cl_page_get(page);
+ lu_ref_add(&page->cp_reference, "truncate", current);
+
+ if (cl_page_own(env, io, page) == 0) {
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ } else {
+ LASSERT(page->cp_state == CPS_FREEING);
+ LASSERT(0);
+ }
+
+ lu_ref_del(&page->cp_reference, "truncate", current);
+ cl_page_put(env, page);
+
+ --ext->oe_nr_pages;
+ ++nr_pages;
+ }
+ EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
+ ext->oe_nr_pages == 0),
+ ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
+
+ osc_object_lock(obj);
+ if (ext->oe_nr_pages == 0) {
+ LASSERT(pages_in_chunk == 0);
+ grants = ext->oe_grants;
+ ext->oe_grants = 0;
+ } else { /* calculate how many grants we can free */
+ int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
+ pgoff_t last_index;
+
+ /* if there are no pages in this chunk, we can also free grants
+ * for the last chunk */
+ if (pages_in_chunk == 0) {
+ /* if this is the 1st chunk and it has no pages,
+ * ext->oe_nr_pages must be zero, so we should be in
+ * the other if-clause. */
+ LASSERT(trunc_chunk > 0);
+ --trunc_chunk;
+ ++chunks;
+ }
+
+ /* this is what we can free from this extent */
+ grants = chunks << cli->cl_chunkbits;
+ ext->oe_grants -= grants;
+ last_index = ((trunc_chunk + 1) << ppc_bits) - 1;
+ ext->oe_end = min(last_index, ext->oe_max_end);
+ LASSERT(ext->oe_end >= ext->oe_start);
+ LASSERT(ext->oe_grants > 0);
+ }
+ osc_object_unlock(obj);
+
+ if (grants > 0 || nr_pages > 0)
+ osc_free_grant(cli, nr_pages, grants);
+
+out:
+ cl_io_fini(env, io);
+ cl_env_nested_put(&nest, env);
+ return rc;
+}
+
+/**
+ * This function is used to make the extent prepared for transfer.
+ * A race with flushing page - ll_writepage() has to be handled cautiously.
+ */
+static int osc_extent_make_ready(const struct lu_env *env,
+ struct osc_extent *ext)
+{
+ struct osc_async_page *oap;
+ struct osc_async_page *last = NULL;
+ struct osc_object *obj = ext->oe_obj;
+ int page_count = 0;
+ int rc;
+
+ /* we're going to grab page lock, so object lock must not be taken. */
+ LASSERT(sanity_check(ext) == 0);
+ /* in locking state, any process should not touch this extent. */
+ EASSERT(ext->oe_state == OES_LOCKING, ext);
+ EASSERT(ext->oe_owner != NULL, ext);
+
+ OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
+
+ list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+ ++page_count;
+ if (last == NULL || last->oap_obj_off < oap->oap_obj_off)
+ last = oap;
+
+ /* checking ASYNC_READY is race safe */
+ if ((oap->oap_async_flags & ASYNC_READY) != 0)
+ continue;
+
+ rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
+ switch (rc) {
+ case 0:
+ spin_lock(&oap->oap_lock);
+ oap->oap_async_flags |= ASYNC_READY;
+ spin_unlock(&oap->oap_lock);
+ break;
+ case -EALREADY:
+ LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
+ break;
+ default:
+ LASSERTF(0, "unknown return code: %d\n", rc);
+ }
+ }
+
+ LASSERT(page_count == ext->oe_nr_pages);
+ LASSERT(last != NULL);
+ /* the last page is the only one whose count needs to be refreshed
+ * against the file size. */
+ if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
+ last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
+ LASSERT(last->oap_count > 0);
+ LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
+ last->oap_async_flags |= ASYNC_COUNT_STABLE;
+ }
+
+ /* for the rest of the pages we don't need to call osc_refresh_count()
+ * because they are known not to be the last page */
+ list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+ if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
+ oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+ oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+ }
+ }
+
+ osc_object_lock(obj);
+ osc_extent_state_set(ext, OES_RPC);
+ osc_object_unlock(obj);
+ /* get a refcount for RPC. */
+ osc_extent_get(ext);
+
+ return 0;
+}
+
+/**
+ * Quick and simple version of osc_extent_find(). This function is frequently
+ * called to expand the extent for the same IO. To expand the extent, the
+ * page index must be in the same or next chunk of ext->oe_end.
+ */
+static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
+{
+ struct osc_object *obj = ext->oe_obj;
+ struct client_obd *cli = osc_cli(obj);
+ struct osc_extent *next;
+ int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+ pgoff_t chunk = index >> ppc_bits;
+ pgoff_t end_chunk;
+ pgoff_t end_index;
+ int chunksize = 1 << cli->cl_chunkbits;
+ int rc = 0;
+
+ LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
+ osc_object_lock(obj);
+ LASSERT(sanity_check_nolock(ext) == 0);
+ end_chunk = ext->oe_end >> ppc_bits;
+ if (chunk > end_chunk + 1) {
+ rc = -ERANGE;
+ goto out;
+ }
+
+ if (end_chunk >= chunk) {
+ rc = 0;
+ goto out;
+ }
+
+ LASSERT(end_chunk + 1 == chunk);
+ /* try to expand this extent to cover @index */
+ end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);
+
+ next = next_extent(ext);
+ if (next != NULL && next->oe_start <= end_index) {
+ /* complex mode - overlapped with the next extent,
+ * this case will be handled by osc_extent_find() */
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ ext->oe_end = end_index;
+ ext->oe_grants += chunksize;
+ *grants -= chunksize;
+ LASSERT(*grants >= 0);
+ EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
+ "overlapped after expanding for %lu.\n", index);
+
+out:
+ osc_object_unlock(obj);
+ return rc;
+}
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+ const char *func, int line)
+{
+ struct osc_extent *ext;
+ int cnt;
+
+ CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
+ obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
+
+ /* osc_object_lock(obj); */
+ cnt = 1;
+ for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext))
+ OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++);
+
+ cnt = 1;
+ list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
+ OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++);
+
+ cnt = 1;
+ list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
+ OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++);
+
+ cnt = 1;
+ list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
+ OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++);
+ /* osc_object_unlock(obj); */
+}
+
+/* ------------------ osc extent end ------------------ */
+
+static inline int osc_is_ready(struct osc_object *osc)
+{
+ return !list_empty(&osc->oo_ready_item) ||
+ !list_empty(&osc->oo_hp_ready_item);
+}
+
+#define OSC_IO_DEBUG(OSC, STR, args...) \
+ CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \
+ (OSC), osc_is_ready(OSC), \
+ list_empty_marker(&(OSC)->oo_hp_ready_item), \
+ list_empty_marker(&(OSC)->oo_ready_item), \
+ atomic_read(&(OSC)->oo_nr_writes), \
+ list_empty_marker(&(OSC)->oo_hp_exts), \
+ list_empty_marker(&(OSC)->oo_urgent_exts), \
+ atomic_read(&(OSC)->oo_nr_reads), \
+ list_empty_marker(&(OSC)->oo_reading_exts), \
+ ##args)
+
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+ int cmd)
+{
+ struct osc_page *opg = oap2osc_page(oap);
+ struct cl_page *page = cl_page_top(oap2cl_page(oap));
+ int result;
+
+ LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
+
+ result = cl_page_make_ready(env, page, CRT_WRITE);
+ if (result == 0)
+ opg->ops_submit_time = cfs_time_current();
+ return result;
+}
+
+static int osc_refresh_count(const struct lu_env *env,
+ struct osc_async_page *oap, int cmd)
+{
+ struct osc_page *opg = oap2osc_page(oap);
+ struct cl_page *page = oap2cl_page(oap);
+ struct cl_object *obj;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+
+ int result;
+ loff_t kms;
+
+ /* readpage queues with _COUNT_STABLE, shouldn't get here. */
+ LASSERT(!(cmd & OBD_BRW_READ));
+ LASSERT(opg != NULL);
+ obj = opg->ops_cl.cpl_obj;
+
+ cl_object_attr_lock(obj);
+ result = cl_object_attr_get(env, obj, attr);
+ cl_object_attr_unlock(obj);
+ if (result < 0)
+ return result;
+ kms = attr->cat_kms;
+ if (cl_offset(obj, page->cp_index) >= kms)
+ /* catch race with truncate */
+ return 0;
+ else if (cl_offset(obj, page->cp_index + 1) > kms)
+ /* catch sub-page write at end of file */
+ return kms % PAGE_CACHE_SIZE;
+ else
+ return PAGE_CACHE_SIZE;
+}
+
+static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
+ int cmd, int rc)
+{
+ struct osc_page *opg = oap2osc_page(oap);
+ struct cl_page *page = cl_page_top(oap2cl_page(oap));
+ struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+ enum cl_req_type crt;
+ int srvlock;
+
+ cmd &= ~OBD_BRW_NOQUOTA;
+ LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ));
+ LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+ LASSERT(opg->ops_transfer_pinned);
+
+ /*
+ * page->cp_req can be NULL if io submission failed before
+ * cl_req was allocated.
+ */
+ if (page->cp_req != NULL)
+ cl_req_page_done(env, page);
+ LASSERT(page->cp_req == NULL);
+
+ crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
+ /* Clear opg->ops_transfer_pinned before VM lock is released. */
+ opg->ops_transfer_pinned = 0;
+
+ spin_lock(&obj->oo_seatbelt);
+ LASSERT(opg->ops_submitter != NULL);
+ LASSERT(!list_empty(&opg->ops_inflight));
+ list_del_init(&opg->ops_inflight);
+ opg->ops_submitter = NULL;
+ spin_unlock(&obj->oo_seatbelt);
+
+ opg->ops_submit_time = 0;
+ srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;
+
+ /* statistic */
+ if (rc == 0 && srvlock) {
+ struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev;
+ struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
+ int bytes = oap->oap_count;
+
+ if (crt == CRT_READ)
+ stats->os_lockless_reads += bytes;
+ else
+ stats->os_lockless_writes += bytes;
+ }
+
+ /*
+ * This has to be the last operation with the page, as locks are
+ * released in cl_page_completion() and nothing except for the
+ * reference counter protects page from concurrent reclaim.
+ */
+ lu_ref_del(&page->cp_reference, "transfer", page);
+
+ cl_page_completion(env, page, crt, rc);
+
+ return 0;
+}
+
+#define OSC_DUMP_GRANT(cli, fmt, args...) do { \
+ struct client_obd *__tmp = (cli); \
+ CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d " \
+ "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt, \
+ __tmp->cl_import->imp_obd->obd_name, \
+ __tmp->cl_dirty, __tmp->cl_dirty_max, \
+ atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \
+ __tmp->cl_lost_grant, __tmp->cl_avail_grant, \
+ __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args); \
+} while (0)
+
+/* caller must hold loi_list_lock */
+static void osc_consume_write_grant(struct client_obd *cli,
+ struct brw_page *pga)
+{
+ assert_spin_locked(&cli->cl_loi_list_lock.lock);
+ LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
+ atomic_inc(&obd_dirty_pages);
+ cli->cl_dirty += PAGE_CACHE_SIZE;
+ pga->flag |= OBD_BRW_FROM_GRANT;
+ CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
+ PAGE_CACHE_SIZE, pga, pga->pg);
+ osc_update_next_shrink(cli);
+}
+
+/* the companion to osc_consume_write_grant, called when a brw has completed.
+ * must be called with the loi lock held. */
+static void osc_release_write_grant(struct client_obd *cli,
+ struct brw_page *pga)
+{
+ assert_spin_locked(&cli->cl_loi_list_lock.lock);
+ if (!(pga->flag & OBD_BRW_FROM_GRANT)) {
+ return;
+ }
+
+ pga->flag &= ~OBD_BRW_FROM_GRANT;
+ atomic_dec(&obd_dirty_pages);
+ cli->cl_dirty -= PAGE_CACHE_SIZE;
+ if (pga->flag & OBD_BRW_NOCACHE) {
+ pga->flag &= ~OBD_BRW_NOCACHE;
+ atomic_dec(&obd_dirty_transit_pages);
+ cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
+ }
+}
+
+/**
+ * To avoid sleeping with the object lock held, it's good for us to allocate
+ * enough grant before entering the critical section.
+ *
+ * client_obd_list_lock held by caller
+ */
+static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
+{
+ int rc = -EDQUOT;
+
+ if (cli->cl_avail_grant >= bytes) {
+ cli->cl_avail_grant -= bytes;
+ cli->cl_reserved_grant += bytes;
+ rc = 0;
+ }
+ return rc;
+}
+
+static void __osc_unreserve_grant(struct client_obd *cli,
+ unsigned int reserved, unsigned int unused)
+{
+ /* it's quite normal for us to get back more grant than we reserved.
+ * Consider the case where two extents are merged by adding a new
+ * chunk: we can save one extent tax. If the extent tax is greater
+ * than one chunk, we can save even more grant by adding that chunk */
+ cli->cl_reserved_grant -= reserved;
+ if (unused > reserved) {
+ cli->cl_avail_grant += reserved;
+ cli->cl_lost_grant += unused - reserved;
+ } else {
+ cli->cl_avail_grant += unused;
+ }
+}
+
+void osc_unreserve_grant(struct client_obd *cli,
+ unsigned int reserved, unsigned int unused)
+{
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ __osc_unreserve_grant(cli, reserved, unused);
+ if (unused > 0)
+ osc_wake_cache_waiters(cli);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Free grant after IO is finished or canceled.
+ *
+ * @lost_grant is used to remember how many grants we have allocated but not
+ * used, we should return these grants to OST. There're two cases where grants
+ * can be lost:
+ * 1. truncate;
+ * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was
+ * written. In this case the OST may use fewer chunks to serve this
+ * partial write. OSTs don't actually know the page size on the client
+ * side, so clients have to calculate the lost grant using the blocksize
+ * on the OST. See filter_grant_check() for details.
+ */
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+ unsigned int lost_grant)
+{
+ int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ atomic_sub(nr_pages, &obd_dirty_pages);
+ cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
+ cli->cl_lost_grant += lost_grant;
+ if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
+ /* borrow some grant from truncate to avoid the case that
+ * truncate uses up all avail grant */
+ cli->cl_lost_grant -= grant;
+ cli->cl_avail_grant += grant;
+ }
+ osc_wake_cache_waiters(cli);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
+ lost_grant, cli->cl_lost_grant,
+ cli->cl_avail_grant, cli->cl_dirty);
+}
+
+/**
+ * The companion to osc_enter_cache(), called when @oap is no longer part of
+ * the dirty accounting due to error.
+ */
+static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
+{
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ osc_release_write_grant(cli, &oap->oap_brw_page);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Non-blocking version of osc_enter_cache() that consumes grant only when it
+ * is available.
+ */
+static int osc_enter_cache_try(struct client_obd *cli,
+ struct osc_async_page *oap,
+ int bytes, int transient)
+{
+ int rc;
+
+ OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+ rc = osc_reserve_grant(cli, bytes);
+ if (rc < 0)
+ return 0;
+
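+ /* grant has been reserved; consume it only if both the per-client and
+ * the global dirty page limits still allow one more dirty page */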
+ if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max &&
+ atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
+ osc_consume_write_grant(cli, &oap->oap_brw_page);
+ if (transient) {
+ cli->cl_dirty_transit += PAGE_CACHE_SIZE;
+ atomic_inc(&obd_dirty_transit_pages);
+ oap->oap_brw_flags |= OBD_BRW_NOCACHE;
+ }
+ rc = 1;
+ } else {
+ __osc_unreserve_grant(cli, bytes, bytes);
+ rc = 0;
+ }
+ return rc;
+}
+
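+/* A cache waiter is removed from cl_cache_waiters by osc_wake_cache_waiters()
+ * once it has been granted (or refused), so an empty ocw_entry means the wait
+ * is over. */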
+static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+{
+ int rc;
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ rc = list_empty(&ocw->ocw_entry);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ return rc;
+}
+
+/**
+ * The main entry point for reserving dirty page accounting. Usually the grant
+ * reserved in this function is freed in bulk in osc_free_grant(); if adding
+ * the page to the osc cache fails, it is freed in osc_exit_cache() instead.
+ *
+ * The process is put to sleep if it has already run out of grant.
+ */
+static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
+ struct osc_async_page *oap, int bytes)
+{
+ struct osc_object *osc = oap->oap_obj;
+ struct lov_oinfo *loi = osc->oo_oinfo;
+ struct osc_cache_waiter ocw;
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ int rc = -EDQUOT;
+
+ OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+
+ /* force the caller to try sync io. this can jump the list
+ * of queued writes and create a discontiguous rpc stream */
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
+ cli->cl_dirty_max < PAGE_CACHE_SIZE ||
+ cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) {
+ rc = -EDQUOT;
+ goto out;
+ }
+
+ /* Hopefully normal case - cache space and write credits available */
+ if (osc_enter_cache_try(cli, oap, bytes, 0)) {
+ rc = 0;
+ goto out;
+ }
+
+ /* We can get here for two reasons: too many dirty pages in the cache,
+ * or we have run out of grant. In both cases we should write dirty
+ * pages out. Adding a cache waiter will trigger urgent write-out no
+ * matter what the RPC size will be.
+ * The exit condition is no available grant and no dirty pages cached,
+ * which really means there is no space left on the OST. */
+ init_waitqueue_head(&ocw.ocw_waitq);
+ ocw.ocw_oap = oap;
+ ocw.ocw_grant = bytes;
+ while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) {
+ list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
+ ocw.ocw_rc = 0;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ osc_io_unplug_async(env, cli, NULL);
+
+ CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
+ cli->cl_import->imp_obd->obd_name, &ocw, oap);
+
+ rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+
+ /* l_wait_event is interrupted by signal */
+ if (rc < 0) {
+ list_del_init(&ocw.ocw_entry);
+ goto out;
+ }
+
+ LASSERT(list_empty(&ocw.ocw_entry));
+ rc = ocw.ocw_rc;
+
+ if (rc != -EDQUOT)
+ goto out;
+ if (osc_enter_cache_try(cli, oap, bytes, 0)) {
+ rc = 0;
+ goto out;
+ }
+ }
+out:
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ OSC_DUMP_GRANT(cli, "returned %d.\n", rc);
+ return rc;
+}
+
+/* caller must hold loi_list_lock */
+void osc_wake_cache_waiters(struct client_obd *cli)
+{
+ struct list_head *l, *tmp;
+ struct osc_cache_waiter *ocw;
+
+ list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+ ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
+ list_del_init(&ocw->ocw_entry);
+
+ ocw->ocw_rc = -EDQUOT;
+ /* we can't dirty more */
+ if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) ||
+ (atomic_read(&obd_dirty_pages) + 1 >
+ obd_max_dirty_pages)) {
+ CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n",
+ cli->cl_dirty,
+ cli->cl_dirty_max, obd_max_dirty_pages);
+ goto wakeup;
+ }
+
+ ocw->ocw_rc = 0;
+ if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
+ ocw->ocw_rc = -EDQUOT;
+
+wakeup:
+ CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
+ ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
+
+ wake_up(&ocw->ocw_waitq);
+ }
+}
+
+static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
+{
+ int hprpc = !!list_empty(&osc->oo_hp_exts);
+ return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
+}
+
+/* Decide whether the given object has pending pages that warrant sending an
+ * RPC for @cmd. This is used by osc_check_rpcs()->osc_next_obj() and
+ * osc_list_maint() to quickly find objects that are ready to send an RPC. */
+static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
+ int cmd)
+{
+ int invalid_import = 0;
+
+ /* if we have an invalid import we want to drain the queued pages
+ * by forcing them through rpcs that immediately fail and complete
+ * the pages. Recovery relies on this to empty the queued pages
+ * before canceling the locks and evicting the llite pages */
+ if ((cli->cl_import == NULL || cli->cl_import->imp_invalid))
+ invalid_import = 1;
+
+ if (cmd & OBD_BRW_WRITE) {
+ if (atomic_read(&osc->oo_nr_writes) == 0)
+ return 0;
+ if (invalid_import) {
+ CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+ return 1;
+ }
+ if (!list_empty(&osc->oo_hp_exts)) {
+ CDEBUG(D_CACHE, "high prio request forcing RPC\n");
+ return 1;
+ }
+ if (!list_empty(&osc->oo_urgent_exts)) {
+ CDEBUG(D_CACHE, "urgent request forcing RPC\n");
+ return 1;
+ }
+ /* trigger a write rpc stream as long as there are dirtiers
+ * waiting for space. As they're waiting, they're not going to
+ * create more pages to coalesce with what's waiting. */
+ if (!list_empty(&cli->cl_cache_waiters)) {
+ CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
+ return 1;
+ }
+ if (atomic_read(&osc->oo_nr_writes) >=
+ cli->cl_max_pages_per_rpc)
+ return 1;
+ } else {
+ if (atomic_read(&osc->oo_nr_reads) == 0)
+ return 0;
+ if (invalid_import) {
+ CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+ return 1;
+ }
+ /* all reads are urgent. */
+ if (!list_empty(&osc->oo_reading_exts))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
+{
+ struct client_obd *cli = osc_cli(obj);
+ if (cmd & OBD_BRW_WRITE) {
+ atomic_add(delta, &obj->oo_nr_writes);
+ atomic_add(delta, &cli->cl_pending_w_pages);
+ LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
+ } else {
+ atomic_add(delta, &obj->oo_nr_reads);
+ atomic_add(delta, &cli->cl_pending_r_pages);
+ LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
+ }
+ OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
+}
+
+static int osc_makes_hprpc(struct osc_object *obj)
+{
+ return !list_empty(&obj->oo_hp_exts);
+}
+
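+/* add @item to @list when @should_be_on is set and the item is not yet on the
+ * list; remove it when @should_be_on is clear and it is still on the list */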
+static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
+{
+ if (list_empty(item) && should_be_on)
+ list_add_tail(item, list);
+ else if (!list_empty(item) && !should_be_on)
+ list_del_init(item);
+}
+
+/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc
+ * can find pages to build into rpcs quickly */
+static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
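+ /* an object with high-priority extents goes only on the hp ready list;
+ * otherwise it goes on the normal ready list whenever it can make an
+ * RPC for either reads or writes */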
+ if (osc_makes_hprpc(osc)) {
+ /* HP rpc */
+ on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
+ on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+ } else {
+ on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+ on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
+ osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
+ osc_makes_rpc(cli, osc, OBD_BRW_READ));
+ }
+
+ on_list(&osc->oo_write_item, &cli->cl_loi_write_list,
+ atomic_read(&osc->oo_nr_writes) > 0);
+
+ on_list(&osc->oo_read_item, &cli->cl_loi_read_list,
+ atomic_read(&osc->oo_nr_reads) > 0);
+
+ return osc_is_ready(osc);
+}
+
+static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+ int is_ready;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ is_ready = __osc_list_maint(cli, osc);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return is_ready;
+}
+
+/* This propagates async writeback errors back up to the application. When an
+ * async write fails we record the error code for later in case the app does
+ * an fsync. As long as errors persist we force future rpcs to be sync so that
+ * the app can get a sync error and break the cycle of queueing pages for
+ * which writeback will fail. */
+static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
+ int rc)
+{
+ if (rc) {
+ if (!ar->ar_rc)
+ ar->ar_rc = rc;
+
+ ar->ar_force_sync = 1;
+ ar->ar_min_xid = ptlrpc_sample_next_xid();
+ return;
+ }
+
+ if (ar->ar_force_sync && (xid >= ar->ar_min_xid))
+ ar->ar_force_sync = 0;
+}
+
+/* this must be called holding the loi list lock to give coverage to exit_cache,
+ * async_flag maintenance, and oap_request */
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+ struct osc_async_page *oap, int sent, int rc)
+{
+ struct osc_object *osc = oap->oap_obj;
+ struct lov_oinfo *loi = osc->oo_oinfo;
+ __u64 xid = 0;
+
+ if (oap->oap_request != NULL) {
+ xid = ptlrpc_req_xid(oap->oap_request);
+ ptlrpc_req_finished(oap->oap_request);
+ oap->oap_request = NULL;
+ }
+
+ /* As the transfer for this page is being done, clear the flags */
+ spin_lock(&oap->oap_lock);
+ oap->oap_async_flags = 0;
+ spin_unlock(&oap->oap_lock);
+ oap->oap_interrupted = 0;
+
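+ /* for completed writes, feed the request xid and result into the async
+ * error tracking so that a later fsync can observe the failure */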
+ if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) {
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ osc_process_ar(&cli->cl_ar, xid, rc);
+ osc_process_ar(&loi->loi_ar, xid, rc);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ }
+
+ rc = osc_completion(env, oap, oap->oap_cmd, rc);
+ if (rc)
+ CERROR("completion on oap %p obj %p returns %d.\n",
+ oap, osc, rc);
+}
+
+/**
+ * Try to add extent to one RPC. We need to think about the following things:
+ * - # of pages must not be over max_pages_per_rpc
+ * - extent must be compatible with previous ones
+ */
+static int try_to_add_extent_for_io(struct client_obd *cli,
+ struct osc_extent *ext, struct list_head *rpclist,
+ int *pc, unsigned int *max_pages)
+{
+ struct osc_extent *tmp;
+ struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
+ struct osc_async_page,
+ oap_pending_item);
+
+ EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
+ ext);
+
+ *max_pages = max(ext->oe_mppr, *max_pages);
+ if (*pc + ext->oe_nr_pages > *max_pages)
+ return 0;
+
+ list_for_each_entry(tmp, rpclist, oe_link) {
+ struct osc_async_page *oap2;
+
+ oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
+ oap_pending_item);
+ EASSERT(tmp->oe_owner == current, tmp);
+#if 0
+ if (overlapped(tmp, ext)) {
+ OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext);
+ EASSERT(0, ext);
+ }
+#endif
+ if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
+ CDEBUG(D_CACHE, "Do not permit different type of IO"
+ " for a same RPC\n");
+ return 0;
+ }
+
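+ /* do not mix extents that hold grants with ones that do not, nor
+ * extents with different srvlock settings, in a single RPC */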
+ if (tmp->oe_srvlock != ext->oe_srvlock ||
+ !tmp->oe_grants != !ext->oe_grants)
+ return 0;
+
+ /* remove this break to check every extent in rpclist strictly */
+ break;
+ }
+
+ *pc += ext->oe_nr_pages;
+ list_move_tail(&ext->oe_link, rpclist);
+ ext->oe_owner = current;
+ return 1;
+}
+
+/**
+ * In order to prevent multiple ptlrpcd threads from breaking contiguous
+ * extents, get_write_extents() takes all appropriate extents atomically.
+ *
+ * The following policy is used to collect extents for IO:
+ * 1. Add as many HP extents as possible;
+ * 2. Add the first urgent extent in urgent extent list and take it out of
+ * urgent list;
+ * 3. Add subsequent extents of this urgent extent;
+ * 4. If urgent list is not empty, goto 2;
+ * 5. Traverse the extent tree from the 1st extent;
+ * 6. Above steps exit if there is no space in this RPC.
+ */
+static int get_write_extents(struct osc_object *obj, struct list_head *rpclist)
+{
+ struct client_obd *cli = osc_cli(obj);
+ struct osc_extent *ext;
+ int page_count = 0;
+ unsigned int max_pages = cli->cl_max_pages_per_rpc;
+
+ LASSERT(osc_object_is_locked(obj));
+ while (!list_empty(&obj->oo_hp_exts)) {
+ ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
+ oe_link);
+ LASSERT(ext->oe_state == OES_CACHE);
+ if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+ &max_pages))
+ return page_count;
+ EASSERT(ext->oe_nr_pages <= max_pages, ext);
+ }
+ if (page_count == max_pages)
+ return page_count;
+
+ while (!list_empty(&obj->oo_urgent_exts)) {
+ ext = list_entry(obj->oo_urgent_exts.next,
+ struct osc_extent, oe_link);
+ if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+ &max_pages))
+ return page_count;
+
+ if (!ext->oe_intree)
+ continue;
+
+ while ((ext = next_extent(ext)) != NULL) {
+ if ((ext->oe_state != OES_CACHE) ||
+ (!list_empty(&ext->oe_link) &&
+ ext->oe_owner != NULL))
+ continue;
+
+ if (!try_to_add_extent_for_io(cli, ext, rpclist,
+ &page_count, &max_pages))
+ return page_count;
+ }
+ }
+ if (page_count == max_pages)
+ return page_count;
+
+ ext = first_extent(obj);
+ while (ext != NULL) {
+ if ((ext->oe_state != OES_CACHE) ||
+ /* this extent may be already in current rpclist */
+ (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) {
+ ext = next_extent(ext);
+ continue;
+ }
+
+ if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+ &max_pages))
+ return page_count;
+
+ ext = next_extent(ext);
+ }
+ return page_count;
+}
+
+static int
+osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
+ struct osc_object *osc, pdl_policy_t pol)
+{
+ LIST_HEAD(rpclist);
+ struct osc_extent *ext;
+ struct osc_extent *tmp;
+ struct osc_extent *first = NULL;
+ u32 page_count = 0;
+ int srvlock = 0;
+ int rc = 0;
+
+ LASSERT(osc_object_is_locked(osc));
+
+ page_count = get_write_extents(osc, &rpclist);
+ LASSERT(equi(page_count == 0, list_empty(&rpclist)));
+
+ if (list_empty(&rpclist))
+ return 0;
+
+ osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
+
+ list_for_each_entry(ext, &rpclist, oe_link) {
+ LASSERT(ext->oe_state == OES_CACHE ||
+ ext->oe_state == OES_LOCK_DONE);
+ if (ext->oe_state == OES_CACHE)
+ osc_extent_state_set(ext, OES_LOCKING);
+ else
+ osc_extent_state_set(ext, OES_RPC);
+ }
+
+ /* we're going to grab page lock, so release object lock because
+ * lock order is page lock -> object lock. */
+ osc_object_unlock(osc);
+
+ list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
+ if (ext->oe_state == OES_LOCKING) {
+ rc = osc_extent_make_ready(env, ext);
+ if (unlikely(rc < 0)) {
+ list_del_init(&ext->oe_link);
+ osc_extent_finish(env, ext, 0, rc);
+ continue;
+ }
+ }
+ if (first == NULL) {
+ first = ext;
+ srvlock = ext->oe_srvlock;
+ } else {
+ LASSERT(srvlock == ext->oe_srvlock);
+ }
+ }
+
+ if (!list_empty(&rpclist)) {
+ LASSERT(page_count > 0);
+ rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol);
+ LASSERT(list_empty(&rpclist));
+ }
+
+ osc_object_lock(osc);
+ return rc;
+}
+
+/**
+ * Prepare pages for ASYNC io and put them in the send queue.
+ *
+ * \return zero if no page was added to the send queue.
+ * \return 1 if pages were successfully added to the send queue.
+ * \return negative on errors.
+ */
+static int
+osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
+ struct osc_object *osc, pdl_policy_t pol)
+{
+ struct osc_extent *ext;
+ struct osc_extent *next;
+ LIST_HEAD(rpclist);
+ int page_count = 0;
+ unsigned int max_pages = cli->cl_max_pages_per_rpc;
+ int rc = 0;
+
+ LASSERT(osc_object_is_locked(osc));
+ list_for_each_entry_safe(ext, next,
+ &osc->oo_reading_exts, oe_link) {
+ EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
+ if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
+ &max_pages))
+ break;
+ osc_extent_state_set(ext, OES_RPC);
+ EASSERT(ext->oe_nr_pages <= max_pages, ext);
+ }
+ LASSERT(page_count <= max_pages);
+
+ osc_update_pending(osc, OBD_BRW_READ, -page_count);
+
+ if (!list_empty(&rpclist)) {
+ osc_object_unlock(osc);
+
+ LASSERT(page_count > 0);
+ rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol);
+ LASSERT(list_empty(&rpclist));
+
+ osc_object_lock(osc);
+ }
+ return rc;
+}
+
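+/* detach the first entry from @list and return the osc_object that embeds it
+ * through its oo_##item member */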
+#define list_to_obj(list, item) ({ \
+ struct list_head *__tmp = (list)->next; \
+ list_del_init(__tmp); \
+ list_entry(__tmp, struct osc_object, oo_##item); \
+})
+
+/* This is called by osc_check_rpcs() to find which objects have pages that
+ * we could be sending. These lists are maintained by osc_makes_rpc(). */
+static struct osc_object *osc_next_obj(struct client_obd *cli)
+{
+ /* First return objects that have blocked locks so that they
+ * will be flushed quickly and other clients can get the lock,
+ * then objects which have pages ready to be stuffed into RPCs */
+ if (!list_empty(&cli->cl_loi_hp_ready_list))
+ return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item);
+ if (!list_empty(&cli->cl_loi_ready_list))
+ return list_to_obj(&cli->cl_loi_ready_list, ready_item);
+
+ /* then if we have cache waiters, return all objects with queued
+ * writes. This is especially important when many small files
+ * have filled up the cache and not been fired into rpcs because
+ * they don't pass the nr_pending/object threshold */
+ if (!list_empty(&cli->cl_cache_waiters) &&
+ !list_empty(&cli->cl_loi_write_list))
+ return list_to_obj(&cli->cl_loi_write_list, write_item);
+
+ /* then return all queued objects when we have an invalid import
+ * so that they get flushed */
+ if (cli->cl_import == NULL || cli->cl_import->imp_invalid) {
+ if (!list_empty(&cli->cl_loi_write_list))
+ return list_to_obj(&cli->cl_loi_write_list, write_item);
+ if (!list_empty(&cli->cl_loi_read_list))
+ return list_to_obj(&cli->cl_loi_read_list, read_item);
+ }
+ return NULL;
+}
+
+/* called with the loi list lock held */
+static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli,
+ pdl_policy_t pol)
+{
+ struct osc_object *osc;
+ int rc = 0;
+
+ while ((osc = osc_next_obj(cli)) != NULL) {
+ struct cl_object *obj = osc2cl(osc);
+ struct lu_ref_link link;
+
+ OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
+
+ if (osc_max_rpc_in_flight(cli, osc)) {
+ __osc_list_maint(cli, osc);
+ break;
+ }
+
+ cl_object_get(obj);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ lu_object_ref_add_at(&obj->co_lu, &link, "check",
+ current);
+
+ /* attempt some read/write balancing by alternating between
+ * reads and writes in an object. The makes_rpc checks here
+ * would be redundant if we were getting read/write work items
+ * instead of objects. We don't want send_oap_rpc to drain a
+ * partial read pending queue when we're given this object to
+ * do io on writes while there are cache waiters */
+ osc_object_lock(osc);
+ if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
+ rc = osc_send_write_rpc(env, cli, osc, pol);
+ if (rc < 0) {
+ CERROR("Write request failed with %d\n", rc);
+
+ /* osc_send_write_rpc failed, mostly because of
+ * memory pressure.
+ *
+ * We cannot break out here, because if:
+ * - a page was submitted by osc_io_submit, so the
+ * page is locked;
+ * - no request is in flight; and
+ * - no subsequent request will be issued;
+ * then the system would be in a live-lock state,
+ * because there would be no chance to call
+ * osc_io_unplug() and osc_check_rpcs() any more.
+ * pdflush can't help in this case, because it
+ * might be blocked grabbing the page lock as
+ * mentioned above.
+ *
+ * Anyway, continue to drain pages. */
+ /* break; */
+ }
+ }
+ if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) {
+ rc = osc_send_read_rpc(env, cli, osc, pol);
+ if (rc < 0)
+ CERROR("Read request failed with %d\n", rc);
+ }
+ osc_object_unlock(osc);
+
+ osc_list_maint(cli, osc);
+ lu_object_ref_del_at(&obj->co_lu, &link, "check",
+ current);
+ cl_object_put(env, obj);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ }
+}
+
+static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+ struct osc_object *osc, pdl_policy_t pol, int async)
+{
+ int rc = 0;
+
+ if (osc != NULL && osc_list_maint(cli, osc) == 0)
+ return 0;
+
+ if (!async) {
+ /* disable osc_lru_shrink() temporarily to avoid
+ * potential stack overrun problem. LU-2859 */
+ atomic_inc(&cli->cl_lru_shrinkers);
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ osc_check_rpcs(env, cli, pol);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ atomic_dec(&cli->cl_lru_shrinkers);
+ } else {
+ CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli);
+ LASSERT(cli->cl_writeback_work != NULL);
+ rc = ptlrpcd_queue_work(cli->cl_writeback_work);
+ }
+ return rc;
+}
+
+static int osc_io_unplug_async(const struct lu_env *env,
+ struct client_obd *cli, struct osc_object *osc)
+{
+ /* XXX: the policy argument is not actually used. */
+ return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1);
+}
+
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+ struct osc_object *osc, pdl_policy_t pol)
+{
+ (void)osc_io_unplug0(env, cli, osc, pol, 0);
+}
+
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+ struct page *page, loff_t offset)
+{
+ struct obd_export *exp = osc_export(osc);
+ struct osc_async_page *oap = &ops->ops_oap;
+
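+ /* a NULL page is only used to query the size of an osc_async_page */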
+ if (!page)
+ return cfs_size_round(sizeof(*oap));
+
+ oap->oap_magic = OAP_MAGIC;
+ oap->oap_cli = &exp->exp_obd->u.cli;
+ oap->oap_obj = osc;
+
+ oap->oap_page = page;
+ oap->oap_obj_off = offset;
+ LASSERT(!(offset & ~CFS_PAGE_MASK));
+
+ if (!client_is_remote(exp) && capable(CFS_CAP_SYS_RESOURCE))
+ oap->oap_brw_flags = OBD_BRW_NOQUOTA;
+
+ INIT_LIST_HEAD(&oap->oap_pending_item);
+ INIT_LIST_HEAD(&oap->oap_rpc_item);
+
+ spin_lock_init(&oap->oap_lock);
+ CDEBUG(D_INFO, "oap %p page %p obj off %llu\n",
+ oap, page, oap->oap_obj_off);
+ return 0;
+}
+
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+ struct osc_page *ops)
+{
+ struct osc_io *oio = osc_env_io(env);
+ struct osc_extent *ext = NULL;
+ struct osc_async_page *oap = &ops->ops_oap;
+ struct client_obd *cli = oap->oap_cli;
+ struct osc_object *osc = oap->oap_obj;
+ pgoff_t index;
+ int grants = 0;
+ int brw_flags = OBD_BRW_ASYNC;
+ int cmd = OBD_BRW_WRITE;
+ int need_release = 0;
+ int rc = 0;
+
+ if (oap->oap_magic != OAP_MAGIC)
+ return -EINVAL;
+
+ if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
+ return -EIO;
+
+ if (!list_empty(&oap->oap_pending_item) ||
+ !list_empty(&oap->oap_rpc_item))
+ return -EBUSY;
+
+ /* Set the OBD_BRW_SRVLOCK before the page is queued. */
+ brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
+ if (!client_is_remote(osc_export(osc)) &&
+ capable(CFS_CAP_SYS_RESOURCE)) {
+ brw_flags |= OBD_BRW_NOQUOTA;
+ cmd |= OBD_BRW_NOQUOTA;
+ }
+
+ /* check if the file's owner/group is over quota */
+ if (!(cmd & OBD_BRW_NOQUOTA)) {
+ struct cl_object *obj;
+ struct cl_attr *attr;
+ unsigned int qid[MAXQUOTAS];
+
+ obj = cl_object_top(&osc->oo_cl);
+ attr = &osc_env_info(env)->oti_attr;
+
+ cl_object_attr_lock(obj);
+ rc = cl_object_attr_get(env, obj, attr);
+ cl_object_attr_unlock(obj);
+
+ qid[USRQUOTA] = attr->cat_uid;
+ qid[GRPQUOTA] = attr->cat_gid;
+ if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA)
+ rc = -EDQUOT;
+ if (rc)
+ return rc;
+ }
+
+ oap->oap_cmd = cmd;
+ oap->oap_page_off = ops->ops_from;
+ oap->oap_count = ops->ops_to - ops->ops_from;
+ oap->oap_async_flags = 0;
+ oap->oap_brw_flags = brw_flags;
+
+ OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
+ oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
+
+ index = oap2cl_page(oap)->cp_index;
+
+ /* Add this page to an extent by the following steps:
+ * 1. if there is an active extent for this IO, the page can usually
+ * be added to it, though sometimes the extent must be expanded to
+ * accommodate the page;
+ * 2. otherwise, a new extent will be allocated. */
+
+ ext = oio->oi_active;
+ if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+ /* one chunk plus extent overhead must be enough to write this
+ * page */
+ grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+ if (ext->oe_end >= index)
+ grants = 0;
+
+ /* it doesn't need any grant to dirty this page */
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ rc = osc_enter_cache_try(cli, oap, grants, 0);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ if (rc == 0) { /* try failed */
+ grants = 0;
+ need_release = 1;
+ } else if (ext->oe_end < index) {
+ int tmp = grants;
+ /* try to expand this extent */
+ rc = osc_extent_expand(ext, index, &tmp);
+ if (rc < 0) {
+ need_release = 1;
+ /* don't free reserved grant */
+ } else {
+ OSC_EXTENT_DUMP(D_CACHE, ext,
+ "expanded for %lu.\n", index);
+ osc_unreserve_grant(cli, grants, tmp);
+ grants = 0;
+ }
+ }
+ rc = 0;
+ } else if (ext != NULL) {
+ /* index is located outside of active extent */
+ need_release = 1;
+ }
+ if (need_release) {
+ osc_extent_release(env, ext);
+ oio->oi_active = NULL;
+ ext = NULL;
+ }
+
+ if (ext == NULL) {
+ int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+ /* try to find new extent to cover this page */
+ LASSERT(oio->oi_active == NULL);
+ /* we may have allocated grant for this page if we failed
+ * to expand the previous active extent. */
+ LASSERT(ergo(grants > 0, grants >= tmp));
+
+ rc = 0;
+ if (grants == 0) {
+ /* we haven't allocated grant for this page. */
+ rc = osc_enter_cache(env, cli, oap, tmp);
+ if (rc == 0)
+ grants = tmp;
+ }
+
+ tmp = grants;
+ if (rc == 0) {
+ ext = osc_extent_find(env, osc, index, &tmp);
+ if (IS_ERR(ext)) {
+ LASSERT(tmp == grants);
+ osc_exit_cache(cli, oap);
+ rc = PTR_ERR(ext);
+ ext = NULL;
+ } else {
+ oio->oi_active = ext;
+ }
+ }
+ if (grants > 0)
+ osc_unreserve_grant(cli, grants, tmp);
+ }
+
+ LASSERT(ergo(rc == 0, ext != NULL));
+ if (ext != NULL) {
+ EASSERTF(ext->oe_end >= index && ext->oe_start <= index,
+ ext, "index = %lu.\n", index);
+ LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
+
+ osc_object_lock(osc);
+ if (ext->oe_nr_pages == 0)
+ ext->oe_srvlock = ops->ops_srvlock;
+ else
+ LASSERT(ext->oe_srvlock == ops->ops_srvlock);
+ ++ext->oe_nr_pages;
+ list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
+ osc_object_unlock(osc);
+ }
+ return rc;
+}
+
+int osc_teardown_async_page(const struct lu_env *env,
+ struct osc_object *obj, struct osc_page *ops)
+{
+ struct osc_async_page *oap = &ops->ops_oap;
+ struct osc_extent *ext = NULL;
+ int rc = 0;
+
+ LASSERT(oap->oap_magic == OAP_MAGIC);
+
+ CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n",
+ oap, ops, oap2cl_page(oap)->cp_index);
+
+ osc_object_lock(obj);
+ if (!list_empty(&oap->oap_rpc_item)) {
+ CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap);
+ rc = -EBUSY;
+ } else if (!list_empty(&oap->oap_pending_item)) {
+ ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index);
+ /* only truncated pages are allowed to be taken out.
+ * See osc_extent_truncate() and osc_cache_truncate_start()
+ * for details. */
+ if (ext != NULL && ext->oe_state != OES_TRUNC) {
+ OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n",
+ oap2cl_page(oap)->cp_index);
+ rc = -EBUSY;
+ }
+ }
+ osc_object_unlock(obj);
+ if (ext != NULL)
+ osc_extent_put(env, ext);
+ return rc;
+}
+
+/**
+ * This is called when a page is picked up by kernel to write out.
+ *
+ * We should find out the corresponding extent and add the whole extent
+ * into urgent list. The extent may be being truncated or used, handle it
+ * carefully.
+ */
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+ struct osc_page *ops)
+{
+ struct osc_extent *ext = NULL;
+ struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj);
+ struct cl_page *cp = ops->ops_cl.cpl_page;
+ pgoff_t index = cp->cp_index;
+ struct osc_async_page *oap = &ops->ops_oap;
+ bool unplug = false;
+ int rc = 0;
+
+ osc_object_lock(obj);
+ ext = osc_extent_lookup(obj, index);
+ if (ext == NULL) {
+ osc_extent_tree_dump(D_ERROR, obj);
+ LASSERTF(0, "page index %lu is NOT covered.\n", index);
+ }
+
+ switch (ext->oe_state) {
+ case OES_RPC:
+ case OES_LOCK_DONE:
+ CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp),
+ "flush an in-rpc page?\n");
+ LASSERT(0);
+ break;
+ case OES_LOCKING:
+ /* If we know this extent is being written out, we should abort
+ * so that the writer can make this page ready. Otherwise there
+ * is a deadlock problem: another process may wait for the page
+ * writeback bit while holding the page lock, and meanwhile
+ * vvp_page_make_ready() needs to grab the page lock before
+ * really sending the RPC. */
+ case OES_TRUNC:
+ /* race with truncate, page will be redirtied */
+ case OES_ACTIVE:
+ /* The extent is active so we need to abort and let the caller
+ * re-dirty the page. If we continued on here, and we were the
+ * one making the extent active, we could deadlock waiting for
+ * the page writeback to clear but it won't because the extent
+ * is active and won't be written out. */
+ rc = -EAGAIN;
+ goto out;
+ default:
+ break;
+ }
+
+ rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE);
+ if (rc)
+ goto out;
+
+ spin_lock(&oap->oap_lock);
+ oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT;
+ spin_unlock(&oap->oap_lock);
+
+ if (memory_pressure_get())
+ ext->oe_memalloc = 1;
+
+ ext->oe_urgent = 1;
+ if (ext->oe_state == OES_CACHE) {
+ OSC_EXTENT_DUMP(D_CACHE, ext,
+ "flush page %p make it urgent.\n", oap);
+ if (list_empty(&ext->oe_link))
+ list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+ unplug = true;
+ }
+ rc = 0;
+
+out:
+ osc_object_unlock(obj);
+ osc_extent_put(env, ext);
+ if (unplug)
+ osc_io_unplug_async(env, osc_cli(obj), obj);
+ return rc;
+}
+
+/**
+ * this is called when a sync waiter receives an interruption. Its job is to
+ * get the caller woken as soon as possible. If its page hasn't been put in an
+ * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as
+ * desiring interruption which will forcefully complete the rpc once the rpc
+ * has timed out.
+ */
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
+{
+ struct osc_async_page *oap = &ops->ops_oap;
+ struct osc_object *obj = oap->oap_obj;
+ struct client_obd *cli = osc_cli(obj);
+ struct osc_extent *ext;
+ struct osc_extent *found = NULL;
+ struct list_head *plist;
+ pgoff_t index = oap2cl_page(oap)->cp_index;
+ int rc = -EBUSY;
+ int cmd;
+
+ LASSERT(!oap->oap_interrupted);
+ oap->oap_interrupted = 1;
+
+ /* Find out the caching extent */
+ osc_object_lock(obj);
+ if (oap->oap_cmd & OBD_BRW_WRITE) {
+ plist = &obj->oo_urgent_exts;
+ cmd = OBD_BRW_WRITE;
+ } else {
+ plist = &obj->oo_reading_exts;
+ cmd = OBD_BRW_READ;
+ }
+ list_for_each_entry(ext, plist, oe_link) {
+ if (ext->oe_start <= index && ext->oe_end >= index) {
+ LASSERT(ext->oe_state == OES_LOCK_DONE);
+ /* an extent in OES_LOCK_DONE state already holds a
+ * refcount for the RPC. */
+ found = osc_extent_get(ext);
+ break;
+ }
+ }
+ if (found != NULL) {
+ list_del_init(&found->oe_link);
+ osc_update_pending(obj, cmd, -found->oe_nr_pages);
+ osc_object_unlock(obj);
+
+ osc_extent_finish(env, found, 0, -EINTR);
+ osc_extent_put(env, found);
+ rc = 0;
+ } else {
+ osc_object_unlock(obj);
+ /* ok, it's been put in an rpc. only one oap gets a request
+ * reference */
+ if (oap->oap_request != NULL) {
+ ptlrpc_mark_interrupted(oap->oap_request);
+ ptlrpcd_wake(oap->oap_request);
+ ptlrpc_req_finished(oap->oap_request);
+ oap->oap_request = NULL;
+ }
+ }
+
+ osc_list_maint(cli, obj);
+ return rc;
+}
+
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+ struct list_head *list, int cmd, int brw_flags)
+{
+ struct client_obd *cli = osc_cli(obj);
+ struct osc_extent *ext;
+ struct osc_async_page *oap, *tmp;
+ int page_count = 0;
+ int mppr = cli->cl_max_pages_per_rpc;
+ pgoff_t start = CL_PAGE_EOF;
+ pgoff_t end = 0;
+
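+ /* find the page index range covered by @list; double mppr as needed so
+ * that a single RPC can hold all of the pages */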
+ list_for_each_entry(oap, list, oap_pending_item) {
+ struct cl_page *cp = oap2cl_page(oap);
+ if (cp->cp_index > end)
+ end = cp->cp_index;
+ if (cp->cp_index < start)
+ start = cp->cp_index;
+ ++page_count;
+ mppr <<= (page_count > mppr);
+ }
+
+ ext = osc_extent_alloc(obj);
+ if (ext == NULL) {
+ list_for_each_entry_safe(oap, tmp, list, oap_pending_item) {
+ list_del_init(&oap->oap_pending_item);
+ osc_ap_completion(env, cli, oap, 0, -ENOMEM);
+ }
+ return -ENOMEM;
+ }
+
+ ext->oe_rw = !!(cmd & OBD_BRW_READ);
+ ext->oe_urgent = 1;
+ ext->oe_start = start;
+ ext->oe_end = ext->oe_max_end = end;
+ ext->oe_obj = obj;
+ ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
+ ext->oe_nr_pages = page_count;
+ ext->oe_mppr = mppr;
+ list_splice_init(list, &ext->oe_pages);
+
+ osc_object_lock(obj);
+ /* Reuse the initial refcount for RPC, don't drop it */
+ osc_extent_state_set(ext, OES_LOCK_DONE);
+ if (cmd & OBD_BRW_WRITE) {
+ list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+ osc_update_pending(obj, OBD_BRW_WRITE, page_count);
+ } else {
+ list_add_tail(&ext->oe_link, &obj->oo_reading_exts);
+ osc_update_pending(obj, OBD_BRW_READ, page_count);
+ }
+ osc_object_unlock(obj);
+
+ osc_io_unplug_async(env, cli, obj);
+ return 0;
+}
+
+/**
+ * Called by osc_io_setattr_start() to freeze and destroy covering extents.
+ */
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+ struct osc_object *obj, __u64 size)
+{
+ struct client_obd *cli = osc_cli(obj);
+ struct osc_extent *ext;
+ struct osc_extent *waiting = NULL;
+ pgoff_t index;
+ LIST_HEAD(list);
+ int result = 0;
+ bool partial;
+
+ /* pages with index greater or equal to index will be truncated. */
+ index = cl_index(osc2cl(obj), size);
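+ /* a size that is not page aligned means the page at @index is only
+ * partially truncated */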
+ partial = size > cl_offset(osc2cl(obj), index);
+
+again:
+ osc_object_lock(obj);
+ ext = osc_extent_search(obj, index);
+ if (ext == NULL)
+ ext = first_extent(obj);
+ else if (ext->oe_end < index)
+ ext = next_extent(ext);
+ while (ext != NULL) {
+ EASSERT(ext->oe_state != OES_TRUNC, ext);
+
+ if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
+ /* if ext is in urgent state, there must be a page that has
+ * already been flushed by write_page(). We have to wait for
+ * this extent because we can't truncate that page. */
+ LASSERT(!ext->oe_hp);
+ OSC_EXTENT_DUMP(D_CACHE, ext,
+ "waiting for busy extent\n");
+ waiting = osc_extent_get(ext);
+ break;
+ }
+
+ OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
+
+ osc_extent_get(ext);
+ if (ext->oe_state == OES_ACTIVE) {
+ /* although we grab the inode mutex on the write path, we
+ * release it before releasing the extent (in osc_io_end()),
+ * so there is a race window in which an extent is still in
+ * OES_ACTIVE when truncate starts. */
+ LASSERT(!ext->oe_trunc_pending);
+ ext->oe_trunc_pending = 1;
+ } else {
+ EASSERT(ext->oe_state == OES_CACHE, ext);
+ osc_extent_state_set(ext, OES_TRUNC);
+ osc_update_pending(obj, OBD_BRW_WRITE,
+ -ext->oe_nr_pages);
+ }
+ EASSERT(list_empty(&ext->oe_link), ext);
+ list_add_tail(&ext->oe_link, &list);
+
+ ext = next_extent(ext);
+ }
+ osc_object_unlock(obj);
+
+ osc_list_maint(cli, obj);
+
+ while (!list_empty(&list)) {
+ int rc;
+
+ ext = list_entry(list.next, struct osc_extent, oe_link);
+ list_del_init(&ext->oe_link);
+
+ /* extent may be in OES_ACTIVE state because inode mutex
+ * is released before osc_io_end() in file write case */
+ if (ext->oe_state != OES_TRUNC)
+ osc_extent_wait(env, ext, OES_TRUNC);
+
+ rc = osc_extent_truncate(ext, index, partial);
+ if (rc < 0) {
+ if (result == 0)
+ result = rc;
+
+ OSC_EXTENT_DUMP(D_ERROR, ext,
+ "truncate error %d\n", rc);
+ } else if (ext->oe_nr_pages == 0) {
+ osc_extent_remove(ext);
+ } else {
+ /* this must be an overlapped extent which means only
+ * part of pages in this extent have been truncated.
+ */
+ EASSERTF(ext->oe_start <= index, ext,
+ "trunc index = %lu/%d.\n", index, partial);
+ /* fix index to skip this partially truncated extent */
+ index = ext->oe_end + 1;
+ partial = false;
+
+ /* we need to hold this extent in OES_TRUNC state so
+ * that no writeback will happen. This is to avoid
+ * BUG 17397. */
+ LASSERT(oio->oi_trunc == NULL);
+ oio->oi_trunc = osc_extent_get(ext);
+ OSC_EXTENT_DUMP(D_CACHE, ext,
+ "trunc at %llu\n", size);
+ }
+ osc_extent_put(env, ext);
+ }
+ if (waiting != NULL) {
+ int rc;
+
+ /* ignore the result of osc_extent_wait; the write initiator
+ * should take care of it. */
+ rc = osc_extent_wait(env, waiting, OES_INV);
+ if (rc < 0)
+ OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc);
+
+ osc_extent_put(env, waiting);
+ waiting = NULL;
+ goto again;
+ }
+ return result;
+}
+
+/**
+ * Called after osc_io_setattr_end to add oio->oi_trunc back to cache.
+ */
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+ struct osc_object *obj)
+{
+ struct osc_extent *ext = oio->oi_trunc;
+
+ oio->oi_trunc = NULL;
+ if (ext != NULL) {
+ bool unplug = false;
+
+ EASSERT(ext->oe_nr_pages > 0, ext);
+ EASSERT(ext->oe_state == OES_TRUNC, ext);
+ EASSERT(!ext->oe_urgent, ext);
+
+ OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
+ osc_object_lock(obj);
+ osc_extent_state_set(ext, OES_CACHE);
+ if (ext->oe_fsync_wait && !ext->oe_urgent) {
+ ext->oe_urgent = 1;
+ list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
+ unplug = true;
+ }
+ osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
+ osc_object_unlock(obj);
+ osc_extent_put(env, ext);
+
+ if (unplug)
+ osc_io_unplug_async(env, osc_cli(obj), obj);
+ }
+}
+
+/**
+ * Wait for extents in a specific range to be written out.
+ * The caller must have called osc_cache_writeback_range() to issue IO,
+ * otherwise this function will take a long time to finish.
+ *
+ * The caller must hold the inode mutex, or cancel the exclusive dlm lock, so
+ * that nobody else can dirty this range of the file while we're waiting for
+ * extents to be written.
+ */
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+ pgoff_t start, pgoff_t end)
+{
+ struct osc_extent *ext;
+ pgoff_t index = start;
+ int result = 0;
+
+again:
+ osc_object_lock(obj);
+ ext = osc_extent_search(obj, index);
+ if (ext == NULL)
+ ext = first_extent(obj);
+ else if (ext->oe_end < index)
+ ext = next_extent(ext);
+ while (ext != NULL) {
+ int rc;
+
+ if (ext->oe_start > end)
+ break;
+
+ if (!ext->oe_fsync_wait) {
+ ext = next_extent(ext);
+ continue;
+ }
+
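+ /* an extent still waiting for fsync must have been marked urgent
+ * or high-priority by osc_cache_writeback_range() */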
+ EASSERT(ergo(ext->oe_state == OES_CACHE,
+ ext->oe_hp || ext->oe_urgent), ext);
+ EASSERT(ergo(ext->oe_state == OES_ACTIVE,
+ !ext->oe_hp && ext->oe_urgent), ext);
+
+ index = ext->oe_end + 1;
+ osc_extent_get(ext);
+ osc_object_unlock(obj);
+
+ rc = osc_extent_wait(env, ext, OES_INV);
+ if (result == 0)
+ result = rc;
+ osc_extent_put(env, ext);
+ goto again;
+ }
+ osc_object_unlock(obj);
+
+ OSC_IO_DEBUG(obj, "sync file range.\n");
+ return result;
+}
+
+/**
+ * Called to write out a range of an osc object.
+ *
+ * @hp : should be set if this is caused by lock cancellation;
+ * @discard: is set if dirty pages should be dropped because the file will be
+ * deleted or truncated; this implies that no extent is partially
+ * discarded.
+ *
+ * Return how many pages will be issued, or an error code if an error
+ * occurred.
+ */
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+ pgoff_t start, pgoff_t end, int hp, int discard)
+{
+ struct osc_extent *ext;
+ LIST_HEAD(discard_list);
+ bool unplug = false;
+ int result = 0;
+
+ osc_object_lock(obj);
+ ext = osc_extent_search(obj, start);
+ if (ext == NULL)
+ ext = first_extent(obj);
+ else if (ext->oe_end < start)
+ ext = next_extent(ext);
+ while (ext != NULL) {
+ if (ext->oe_start > end)
+ break;
+
+ ext->oe_fsync_wait = 1;
+ switch (ext->oe_state) {
+ case OES_CACHE:
+ result += ext->oe_nr_pages;
+ if (!discard) {
+ struct list_head *list = NULL;
+ if (hp) {
+ EASSERT(!ext->oe_hp, ext);
+ ext->oe_hp = 1;
+ list = &obj->oo_hp_exts;
+ } else if (!ext->oe_urgent) {
+ ext->oe_urgent = 1;
+ list = &obj->oo_urgent_exts;
+ }
+ if (list != NULL)
+ list_move_tail(&ext->oe_link, list);
+ unplug = true;
+ } else {
+ /* the only discarder is lock cancelling, so
+ * [start, end] must contain this extent */
+ EASSERT(ext->oe_start >= start &&
+ ext->oe_max_end <= end, ext);
+ osc_extent_state_set(ext, OES_LOCKING);
+ ext->oe_owner = current;
+ list_move_tail(&ext->oe_link,
+ &discard_list);
+ osc_update_pending(obj, OBD_BRW_WRITE,
+ -ext->oe_nr_pages);
+ }
+ break;
+ case OES_ACTIVE:
+ /* It's pretty bad to wait for ACTIVE extents, because
+ * we don't know how long we will wait for them to be
+ * flushed since they may be blocked awaiting more
+ * grant. We do this for the correctness of fsync. */
+ LASSERT(hp == 0 && discard == 0);
+ ext->oe_urgent = 1;
+ break;
+ case OES_TRUNC:
+ /* this extent is being truncated, can't do anything
+ * for it now. it will be set to urgent after truncate
+ * is finished in osc_cache_truncate_end(). */
+ default:
+ break;
+ }
+ ext = next_extent(ext);
+ }
+ osc_object_unlock(obj);
+
+ LASSERT(ergo(!discard, list_empty(&discard_list)));
+ if (!list_empty(&discard_list)) {
+ struct osc_extent *tmp;
+ int rc;
+
+ osc_list_maint(osc_cli(obj), obj);
+ list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) {
+ list_del_init(&ext->oe_link);
+ EASSERT(ext->oe_state == OES_LOCKING, ext);
+
+ /* Discard caching pages. We don't actually write this
+ * extent out but we complete it as if we did. */
+ rc = osc_extent_make_ready(env, ext);
+ if (unlikely(rc < 0)) {
+ OSC_EXTENT_DUMP(D_ERROR, ext,
+ "make_ready returned %d\n", rc);
+ if (result >= 0)
+ result = rc;
+ }
+
+ /* finish the extent as if the pages were sent */
+ osc_extent_finish(env, ext, 0, 0);
+ }
+ }
+
+ if (unplug)
+ osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND);
+
+ if (hp || discard) {
+ int rc;
+ rc = osc_cache_wait_range(env, obj, start, end);
+ if (result >= 0 && rc < 0)
+ result = rc;
+ }
+
+ OSC_IO_DEBUG(obj, "cache page out.\n");
+ return result;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
new file mode 100644
index 000000000..365b2787b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
@@ -0,0 +1,685 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of OSC layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#ifndef OSC_CL_INTERNAL_H
+#define OSC_CL_INTERNAL_H
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd.h"
+/* osc_build_res_name() */
+#include "../include/cl_object.h"
+#include "../include/lclient.h"
+#include "osc_internal.h"
+
+/** \defgroup osc osc
+ * @{
+ */
+
+struct osc_extent;
+
+/**
+ * State maintained by osc layer for each IO context.
+ */
+struct osc_io {
+ /** super class */
+ struct cl_io_slice oi_cl;
+ /** true if this io is lockless. */
+ int oi_lockless;
+ /** active extent: we know how many bytes are going to be written,
+ * so having an active extent will prevent it from being fragmented */
+ struct osc_extent *oi_active;
+ /** partially truncated extent, we need to hold this extent to prevent
+ * page writeback from happening. */
+ struct osc_extent *oi_trunc;
+
+ struct obd_info oi_info;
+ struct obdo oi_oa;
+ struct osc_async_cbargs {
+ bool opc_rpc_sent;
+ int opc_rc;
+ struct completion opc_sync;
+ } oi_cbarg;
+};
+
+/**
+ * State of transfer for osc.
+ */
+struct osc_req {
+ struct cl_req_slice or_cl;
+};
+
+/**
+ * State maintained by osc layer for the duration of a system call.
+ */
+struct osc_session {
+ struct osc_io os_io;
+};
+
+#define OTI_PVEC_SIZE 64
+struct osc_thread_info {
+ struct ldlm_res_id oti_resname;
+ ldlm_policy_data_t oti_policy;
+ struct cl_lock_descr oti_descr;
+ struct cl_attr oti_attr;
+ struct lustre_handle oti_handle;
+ struct cl_page_list oti_plist;
+ struct cl_io oti_io;
+ struct cl_page *oti_pvec[OTI_PVEC_SIZE];
+};
+
+struct osc_object {
+ struct cl_object oo_cl;
+ struct lov_oinfo *oo_oinfo;
+ /**
+ * True if locking against this stripe got -EUSERS.
+ */
+ int oo_contended;
+ unsigned long oo_contention_time;
+ /**
+ * List of pages in transfer.
+ */
+ struct list_head oo_inflight[CRT_NR];
+ /**
+ * Lock, protecting ccc_object::cob_inflight, because a seat-belt is
+ * locked during take-off and landing.
+ */
+ spinlock_t oo_seatbelt;
+
+ /**
+ * used by the osc to keep track of what objects to build into rpcs.
+ * Protected by client_obd::cl_loi_list_lock.
+ */
+ struct list_head oo_ready_item;
+ struct list_head oo_hp_ready_item;
+ struct list_head oo_write_item;
+ struct list_head oo_read_item;
+
+ /**
+ * extent is a red black tree to manage (async) dirty pages.
+ */
+ struct rb_root oo_root;
+ /**
+ * Manage write(dirty) extents.
+ */
+ struct list_head oo_hp_exts; /* list of hp extents */
+ struct list_head oo_urgent_exts; /* list of writeback extents */
+ struct list_head oo_rpc_exts;
+
+ struct list_head oo_reading_exts;
+
+ atomic_t oo_nr_reads;
+ atomic_t oo_nr_writes;
+
+ /** Protect extent tree. Will be used to protect
+ * oo_{read|write}_pages soon. */
+ spinlock_t oo_lock;
+};
+
+static inline void osc_object_lock(struct osc_object *obj)
+{
+ spin_lock(&obj->oo_lock);
+}
+
+static inline int osc_object_trylock(struct osc_object *obj)
+{
+ return spin_trylock(&obj->oo_lock);
+}
+
+static inline void osc_object_unlock(struct osc_object *obj)
+{
+ spin_unlock(&obj->oo_lock);
+}
+
+static inline int osc_object_is_locked(struct osc_object *obj)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ return spin_is_locked(&obj->oo_lock);
+#else
+ /*
+ * It is not perfect to return true all the time.
+ * But since this function is only used for assertion
+ * and checking, it seems OK.
+ */
+ return 1;
+#endif
+}
+
+/*
+ * Lock "micro-states" for osc layer.
+ */
+enum osc_lock_state {
+ OLS_NEW,
+ OLS_ENQUEUED,
+ OLS_UPCALL_RECEIVED,
+ OLS_GRANTED,
+ OLS_RELEASED,
+ OLS_BLOCKED,
+ OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ * - ldlm_lock_create()
+ * - ldlm_lock_new(): initializes a lock with 2 references. One for
+ * the caller (released when reply from the server is received, or on
+ * error), and another for the hash table.
+ * - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ * - ldlm_cli_enqueue_fini()
+ * - LDLM_LOCK_PUT(): releases caller reference acquired by
+ * ldlm_lock_new().
+ * - if (rc != 0)
+ * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ * - ldlm_lock_destroy()
+ * - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ * ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when lock is cancelled (osc_lock_blocking()), or when locks is
+ * deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case ldlm lock remains in memory, and can be re-attached to osc_lock in the
+ * future.
+ */
+struct osc_lock {
+ struct cl_lock_slice ols_cl;
+ /** underlying DLM lock */
+ struct ldlm_lock *ols_lock;
+ /** lock value block */
+ struct ost_lvb ols_lvb;
+ /** DLM flags with which osc_lock::ols_lock was enqueued */
+ __u64 ols_flags;
+ /** osc_lock::ols_lock handle */
+ struct lustre_handle ols_handle;
+ struct ldlm_enqueue_info ols_einfo;
+ enum osc_lock_state ols_state;
+
+ /**
+ * How many pages are using this lock for io, currently only used by
+ * read-ahead. If non-zero, the underlying dlm lock won't be cancelled
+ * during recovery to avoid deadlock. see bz16774.
+ *
+ * \see osc_page::ops_lock
+ * \see osc_page_addref_lock(), osc_page_putref_lock()
+ */
+ atomic_t ols_pageref;
+
+ /**
+ * true, if ldlm_lock_addref() was called against
+ * osc_lock::ols_lock. This is used for sanity checking.
+ *
+ * \see osc_lock::ols_has_ref
+ */
+ unsigned ols_hold :1,
+ /**
+ * this is much like osc_lock::ols_hold, except that this bit is
+ * cleared _after_ the reference is released in osc_lock_unuse(). This
+ * fine distinction is needed because:
+ *
+ * - if ldlm lock still has a reference, osc_ast_data_get() needs
+ * to return associated cl_lock (so that a flag is needed that is
+ * cleared after ldlm_lock_decref() returned), and
+ *
+ * - ldlm_lock_decref() can invoke blocking ast (for a
+ * LDLM_FL_CBPENDING lock), and osc_lock functions like
+ * osc_lock_cancel() called from there need to know whether to
+ * release lock reference (so that a flag is needed that is
+ * cleared before ldlm_lock_decref() is called).
+ */
+ ols_has_ref:1,
+ /**
+ * inherit the lockless attribute from top level cl_io.
+ * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+ */
+ ols_locklessable:1,
+ /**
+ * set by osc_lock_use() to wait until blocking AST enters into
+ * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+ * further synchronization.
+ */
+ ols_ast_wait:1,
+ /**
+ * If the data of this lock has been flushed to server side.
+ */
+ ols_flush:1,
+ /**
+ * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+ * the EVAVAIL error as tolerable; this lets the upper logic wait for
+ * the glimpse locks sent to all OSTs to complete.
+ * Glimpse lock converts to normal lock if the server lock is
+ * granted.
+ * Glimpse lock should be destroyed immediately after use.
+ */
+ ols_glimpse:1,
+ /**
+ * For async glimpse lock.
+ */
+ ols_agl:1;
+ /**
+ * IO that owns this lock. This field is used for deadlock
+ * avoidance by osc_lock_enqueue_wait().
+ *
+ * XXX: unfortunately, the owner of an osc_lock is not unique,
+ * the lock may have multiple users, if the lock is granted and
+ * then matched.
+ */
+ struct osc_io *ols_owner;
+};
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+ struct cl_page_slice ops_cl;
+ /**
+ * Page queues used by osc to detect when RPC can be formed.
+ */
+ struct osc_async_page ops_oap;
+ /**
+ * An offset within page from which next transfer starts. This is used
+ * by cl_page_clip() to submit partial page transfers.
+ */
+ int ops_from;
+ /**
+ * An offset within page at which next transfer ends.
+ *
+ * \see osc_page::ops_from.
+ */
+ int ops_to;
+ /**
+ * Boolean, true iff page is under transfer. Used for sanity checking.
+ */
+ unsigned ops_transfer_pinned:1,
+ /**
+ * True for a `temporary page' created by read-ahead code, probably
+ * outside of any DLM lock.
+ */
+ ops_temp:1,
+ /**
+ * in LRU?
+ */
+ ops_in_lru:1,
+ /**
+ * Set if the page must be transferred with OBD_BRW_SRVLOCK.
+ */
+ ops_srvlock:1;
+ union {
+ /**
+ * lru page list. ops_inflight and ops_lru are mutually exclusive
+ * so that they can share the same storage.
+ */
+ struct list_head ops_lru;
+ /**
+ * Linkage into a per-osc_object list of pages in flight. For
+ * debugging.
+ */
+ struct list_head ops_inflight;
+ };
+ /**
+ * Thread that submitted this page for transfer. For debugging.
+ */
+ struct task_struct *ops_submitter;
+ /**
+ * Submit time - the time when the page is starting RPC. For debugging.
+ */
+ unsigned long ops_submit_time;
+
+ /**
+ * A lock of which we hold a reference covers this page. Only used by
+ * read-ahead: for a readahead page, we hold its covering lock to
+ * prevent it from being canceled during recovery.
+ *
+ * \see osc_lock::ols_pageref
+ * \see osc_page_addref_lock(), osc_page_putref_lock().
+ */
+ struct cl_lock *ops_lock;
+};
+
+extern struct kmem_cache *osc_lock_kmem;
+extern struct kmem_cache *osc_object_kmem;
+extern struct kmem_cache *osc_thread_kmem;
+extern struct kmem_cache *osc_session_kmem;
+extern struct kmem_cache *osc_req_kmem;
+extern struct kmem_cache *osc_extent_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+int osc_lock_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *io);
+int osc_io_init (const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io);
+int osc_req_init (const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *hdr,
+ struct lu_device *dev);
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage);
+
+void osc_index2policy (ldlm_policy_data_t *policy, const struct cl_object *obj,
+ pgoff_t start, pgoff_t end);
+int osc_lvb_print (const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct ost_lvb *lvb);
+
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+ enum cl_req_type crt, int brw_flags);
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
+int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
+ u32 async_flags);
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+ struct page *page, loff_t offset);
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+ struct osc_page *ops);
+int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
+ struct osc_page *ops);
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+ struct osc_page *ops);
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+ struct list_head *list, int cmd, int brw_flags);
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+ struct osc_object *obj, __u64 size);
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+ struct osc_object *obj);
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+ pgoff_t start, pgoff_t end, int hp, int discard);
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+ pgoff_t start, pgoff_t end);
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+ struct osc_object *osc, pdl_policy_t pol);
+
+void osc_object_set_contended (struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int osc_object_is_contended (struct osc_object *obj);
+
+int osc_lock_is_lockless (const struct osc_lock *olck);
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
+{
+ struct osc_thread_info *info;
+
+ info = lu_context_key_get(&env->le_ctx, &osc_key);
+ LASSERT(info != NULL);
+ return info;
+}
+
+static inline struct osc_session *osc_env_session(const struct lu_env *env)
+{
+ struct osc_session *ses;
+
+ ses = lu_context_key_get(env->le_ses, &osc_session_key);
+ LASSERT(ses != NULL);
+ return ses;
+}
+
+static inline struct osc_io *osc_env_io(const struct lu_env *env)
+{
+ return &osc_env_session(env)->os_io;
+}
+
+static inline int osc_is_object(const struct lu_object *obj)
+{
+ return obj->lo_dev->ld_type == &osc_device_type;
+}
+
+static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
+{
+ LINVRNT(d->ld_type == &osc_device_type);
+ return container_of0(d, struct osc_device, od_cl.cd_lu_dev);
+}
+
+static inline struct obd_export *osc_export(const struct osc_object *obj)
+{
+ return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp;
+}
+
+static inline struct client_obd *osc_cli(const struct osc_object *obj)
+{
+ return &osc_export(obj)->exp_obd->u.cli;
+}
+
+static inline struct osc_object *cl2osc(const struct cl_object *obj)
+{
+ LINVRNT(osc_is_object(&obj->co_lu));
+ return container_of0(obj, struct osc_object, oo_cl);
+}
+
+static inline struct cl_object *osc2cl(const struct osc_object *obj)
+{
+ return (struct cl_object *)&obj->oo_cl;
+}
+
+static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+ LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+ if (mode == CLM_READ)
+ return LCK_PR;
+ else if (mode == CLM_WRITE)
+ return LCK_PW;
+ else
+ return LCK_GROUP;
+}
+
+static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
+{
+ LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+ if (mode == LCK_PR)
+ return CLM_READ;
+ else if (mode == LCK_PW)
+ return CLM_WRITE;
+ else
+ return CLM_GROUP;
+}
+
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{
+ LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
+ return container_of0(slice, struct osc_page, ops_cl);
+}
+
+static inline struct osc_page *oap2osc(struct osc_async_page *oap)
+{
+ return container_of0(oap, struct osc_page, ops_oap);
+}
+
+static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
+{
+ return oap2osc(oap)->ops_cl.cpl_page;
+}
+
+static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
+{
+ return (struct osc_page *)container_of(oap, struct osc_page, ops_oap);
+}
+
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{
+ LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
+ return container_of0(slice, struct osc_lock, ols_cl);
+}
+
+static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
+{
+ return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
+}
+
+static inline int osc_io_srvlock(struct osc_io *oio)
+{
+ return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock);
+}
+
+enum osc_extent_state {
+ OES_INV = 0, /** extent is just initialized or destroyed */
+ OES_ACTIVE = 1, /** process is using this extent */
+ OES_CACHE = 2, /** extent is ready for IO */
+ OES_LOCKING = 3, /** locking page to prepare IO */
+ OES_LOCK_DONE = 4, /** locking finished, ready to send */
+ OES_RPC = 5, /** in RPC */
+ OES_TRUNC = 6, /** being truncated */
+ OES_STATE_MAX
+};
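+
+/*
+ * Rough lifecycle, as suggested by the state descriptions above (the exact
+ * transitions are implemented by the extent cache code and may differ in
+ * detail):
+ *
+ *   OES_INV -> OES_ACTIVE (pages being added) -> OES_CACHE (released,
+ *   waiting for writeback) -> OES_LOCKING -> OES_LOCK_DONE -> OES_RPC
+ *   -> OES_INV (finished and destroyed).
+ *
+ * An extent with oe_trunc_pending set moves to OES_TRUNC instead of
+ * OES_CACHE when it is released.
+ */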
+
+/**
+ * osc_extent data to manage dirty pages.
+ * osc_extent has the following attributes:
+ * 1. all pages in the same extent must be in one RPC in write back;
+ * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
+ * 3. must be covered by only 1 osc_lock;
+ * 4. exclusive. It's impossible to have overlapping osc_extents.
+ *
+ * The lifetime of an extent is from when the 1st page is dirtied to when
+ * all pages inside it are written out.
+ *
+ * LOCKING ORDER
+ * =============
+ * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock)
+ */
+struct osc_extent {
+ /** red-black tree node */
+ struct rb_node oe_node;
+ /** osc_object of this extent */
+ struct osc_object *oe_obj;
+ /** refcount; removed from the red-black tree when it reaches zero. */
+ atomic_t oe_refc;
+ /** busy if non-zero */
+ atomic_t oe_users;
+ /** list linkage into osc_object's oo_{hp|urgent|locking}_exts. */
+ struct list_head oe_link;
+ /** state of this extent */
+ unsigned int oe_state;
+ /** flags for this extent. */
+ unsigned int oe_intree:1,
+ /** 0 is write, 1 is read */
+ oe_rw:1,
+ oe_srvlock:1,
+ oe_memalloc:1,
+ /** an ACTIVE extent is going to be truncated, so when this extent
+ * is released, it will turn into TRUNC state instead of CACHE. */
+ oe_trunc_pending:1,
+ /** this extent should be written asap and someone may wait for the
+ * write to finish. This bit is usually set along with urgent if
+ * the extent was in CACHE state.
+ * An fsync_wait extent can't be merged, because the new extent
+ * region may exceed the fsync range. */
+ oe_fsync_wait:1,
+ /** covering lock is being canceled */
+ oe_hp:1,
+ /** this extent should be written back asap. Set if one of its pages is
+ * picked up by the page writeback daemon, or by sync write or read
+ * requests. */
+ oe_urgent:1;
+ /** how many grants are allocated for this extent.
+ * No grant is allocated for read extents or sync write extents. */
+ unsigned int oe_grants;
+ /** # of dirty pages in this extent */
+ unsigned int oe_nr_pages;
+ /** list of pending oap pages. Pages in this list are NOT sorted. */
+ struct list_head oe_pages;
+ /** Since an extent has to be written out atomically, this is used to
+ * remember the next page that needs to be locked to write this extent
+ * out. Not used right now.
+ */
+ struct osc_page *oe_next_page;
+ /** start and end index of this extent, inclusive of both start and
+ * end. Page offset here is the page index of osc_pages.
+ * oe_start is used as the key for the red-black tree. */
+ pgoff_t oe_start;
+ pgoff_t oe_end;
+ /** maximum ending index of this extent; this is limited by
+ * max_pages_per_rpc, the lock extent and the chunk size. */
+ pgoff_t oe_max_end;
+ /** waitqueue - for those who want to be notified if this extent's
+ * state has changed. */
+ wait_queue_head_t oe_waitq;
+ /** lock covering this extent */
+ struct cl_lock *oe_osclock;
+ /** owner of this extent; must be set while this extent is in IO. */
+ struct task_struct *oe_owner;
+ /** return value of writeback. If somebody is waiting for this extent,
+ * this value is visible to the outside world. */
+ int oe_rc;
+ /** max pages per rpc when this extent was created */
+ unsigned int oe_mppr;
+};
+
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+ int sent, int rc);
+void osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+
+/** @} osc */
+
+#endif /* OSC_CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c
new file mode 100644
index 000000000..4935fc7c0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_dev.c
@@ -0,0 +1,262 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device, cl_req for OSC layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* class_name2obd() */
+#include "../include/obd_class.h"
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+struct kmem_cache *osc_lock_kmem;
+struct kmem_cache *osc_object_kmem;
+struct kmem_cache *osc_thread_kmem;
+struct kmem_cache *osc_session_kmem;
+struct kmem_cache *osc_req_kmem;
+struct kmem_cache *osc_extent_kmem;
+struct kmem_cache *osc_quota_kmem;
+
+struct lu_kmem_descr osc_caches[] = {
+ {
+ .ckd_cache = &osc_lock_kmem,
+ .ckd_name = "osc_lock_kmem",
+ .ckd_size = sizeof(struct osc_lock)
+ },
+ {
+ .ckd_cache = &osc_object_kmem,
+ .ckd_name = "osc_object_kmem",
+ .ckd_size = sizeof(struct osc_object)
+ },
+ {
+ .ckd_cache = &osc_thread_kmem,
+ .ckd_name = "osc_thread_kmem",
+ .ckd_size = sizeof(struct osc_thread_info)
+ },
+ {
+ .ckd_cache = &osc_session_kmem,
+ .ckd_name = "osc_session_kmem",
+ .ckd_size = sizeof(struct osc_session)
+ },
+ {
+ .ckd_cache = &osc_req_kmem,
+ .ckd_name = "osc_req_kmem",
+ .ckd_size = sizeof(struct osc_req)
+ },
+ {
+ .ckd_cache = &osc_extent_kmem,
+ .ckd_name = "osc_extent_kmem",
+ .ckd_size = sizeof(struct osc_extent)
+ },
+ {
+ .ckd_cache = &osc_quota_kmem,
+ .ckd_name = "osc_quota_kmem",
+ .ckd_size = sizeof(struct osc_quota_info)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+struct lock_class_key osc_ast_guard_class;
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_device *osc2lu_dev(struct osc_device *osc)
+{
+ return &osc->od_cl.cd_lu_dev;
+}
+
+/*****************************************************************************
+ *
+ * Osc device and device type functions.
+ *
+ */
+
+static void *osc_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct osc_thread_info *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, GFP_NOFS);
+ if (info == NULL)
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+static void osc_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct osc_thread_info *info = data;
+
+ OBD_SLAB_FREE_PTR(info, osc_thread_kmem);
+}
+
+struct lu_context_key osc_key = {
+ .lct_tags = LCT_CL_THREAD,
+ .lct_init = osc_key_init,
+ .lct_fini = osc_key_fini
+};
+
+static void *osc_session_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct osc_session *info;
+
+ OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, GFP_NOFS);
+ if (info == NULL)
+ info = ERR_PTR(-ENOMEM);
+ return info;
+}
+
+static void osc_session_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct osc_session *info = data;
+
+ OBD_SLAB_FREE_PTR(info, osc_session_kmem);
+}
+
+struct lu_context_key osc_session_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = osc_session_init,
+ .lct_fini = osc_session_fini
+};
+
+/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
+
+static int osc_cl_process_config(const struct lu_env *env,
+ struct lu_device *d, struct lustre_cfg *cfg)
+{
+ return osc_process_config_base(d->ld_obd, cfg);
+}
+
+static const struct lu_device_operations osc_lu_ops = {
+ .ldo_object_alloc = osc_object_alloc,
+ .ldo_process_config = osc_cl_process_config,
+ .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations osc_cl_ops = {
+ .cdo_req_init = osc_req_init
+};
+
+static int osc_device_init(const struct lu_env *env, struct lu_device *d,
+ const char *name, struct lu_device *next)
+{
+ return 0;
+}
+
+static struct lu_device *osc_device_fini(const struct lu_env *env,
+ struct lu_device *d)
+{
+ return NULL;
+}
+
+static struct lu_device *osc_device_free(const struct lu_env *env,
+ struct lu_device *d)
+{
+ struct osc_device *od = lu2osc_dev(d);
+
+ cl_device_fini(lu2cl_dev(d));
+ OBD_FREE_PTR(od);
+ return NULL;
+}
+
+static struct lu_device *osc_device_alloc(const struct lu_env *env,
+ struct lu_device_type *t,
+ struct lustre_cfg *cfg)
+{
+ struct lu_device *d;
+ struct osc_device *od;
+ struct obd_device *obd;
+ int rc;
+
+ OBD_ALLOC_PTR(od);
+ if (od == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ cl_device_init(&od->od_cl, t);
+ d = osc2lu_dev(od);
+ d->ld_ops = &osc_lu_ops;
+ od->od_cl.cd_ops = &osc_cl_ops;
+
+ /* Setup OSC OBD */
+ obd = class_name2obd(lustre_cfg_string(cfg, 0));
+ LASSERT(obd != NULL);
+ rc = osc_setup(obd, cfg);
+ if (rc) {
+ osc_device_free(env, d);
+ return ERR_PTR(rc);
+ }
+ od->od_exp = obd->obd_self_export;
+ return d;
+}
+
+static const struct lu_device_type_operations osc_device_type_ops = {
+ .ldto_init = osc_type_init,
+ .ldto_fini = osc_type_fini,
+
+ .ldto_start = osc_type_start,
+ .ldto_stop = osc_type_stop,
+
+ .ldto_device_alloc = osc_device_alloc,
+ .ldto_device_free = osc_device_free,
+
+ .ldto_device_init = osc_device_init,
+ .ldto_device_fini = osc_device_fini
+};
+
+struct lu_device_type osc_device_type = {
+ .ldt_tags = LU_DEVICE_CL,
+ .ldt_name = LUSTRE_OSC_NAME,
+ .ldt_ops = &osc_device_type_ops,
+ .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
new file mode 100644
index 000000000..af96c7bc7
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_internal.h
@@ -0,0 +1,203 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef OSC_INTERNAL_H
+#define OSC_INTERNAL_H
+
+#define OAP_MAGIC 8675309
+
+struct lu_env;
+
+enum async_flags {
+ ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
+ page is added to an rpc */
+ ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
+ ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
+ to give the caller a chance to update
+ or cancel the size of the io */
+ ASYNC_HP = 0x10,
+};
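+
+/*
+ * For example, osc_io_submit() marks each page it has prepared
+ * ASYNC_URGENT | ASYNC_READY | ASYNC_COUNT_STABLE: the page can be packed
+ * into an RPC right away and its byte count will not change.
+ */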
+
+struct osc_async_page {
+ int oap_magic;
+ unsigned short oap_cmd;
+ unsigned short oap_interrupted:1;
+
+ struct list_head oap_pending_item;
+ struct list_head oap_rpc_item;
+
+ u64 oap_obj_off;
+ unsigned oap_page_off;
+ enum async_flags oap_async_flags;
+
+ struct brw_page oap_brw_page;
+
+ struct ptlrpc_request *oap_request;
+ struct client_obd *oap_cli;
+ struct osc_object *oap_obj;
+
+ struct ldlm_lock *oap_ldlm_lock;
+ spinlock_t oap_lock;
+};
+
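+/* Convenience aliases for the fields of the embedded struct brw_page. */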
+#define oap_page oap_brw_page.pg
+#define oap_count oap_brw_page.count
+#define oap_brw_flags oap_brw_page.flag
+
+struct osc_cache_waiter {
+ struct list_head ocw_entry;
+ wait_queue_head_t ocw_waitq;
+ struct osc_async_page *ocw_oap;
+ int ocw_grant;
+ int ocw_rc;
+};
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti);
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+ struct lov_stripe_md **ea, struct obd_trans_info *oti);
+void osc_wake_cache_waiters(struct client_obd *cli);
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
+void osc_update_next_shrink(struct client_obd *cli);
+
+/*
+ * cl integration.
+ */
+#include "../include/cl_object.h"
+
+extern struct ptlrpc_request_set *PTLRPCD_SET;
+
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+ __u64 *flags, ldlm_policy_data_t *policy,
+ struct ost_lvb *lvb, int kms_valid,
+ obd_enqueue_update_f upcall,
+ void *cookie, struct ldlm_enqueue_info *einfo,
+ struct lustre_handle *lockh,
+ struct ptlrpc_request_set *rqset, int async, int agl);
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode);
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+ __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+ __u64 *flags, void *data, struct lustre_handle *lockh,
+ int unref);
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ptlrpc_request_set *rqset);
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ptlrpc_request_set *rqset);
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ptlrpc_request_set *rqset);
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+ struct list_head *ext_list, int cmd, pdl_policy_t p);
+int osc_lru_shrink(struct client_obd *cli, int target);
+
+extern spinlock_t osc_ast_guard;
+
+int osc_cleanup(struct obd_device *obd);
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+
+#if defined(CONFIG_PROC_FS)
+int lproc_osc_attach_seqstat(struct obd_device *dev);
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
+static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+ memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+extern struct lu_device_type osc_device_type;
+
+static inline int osc_recoverable_error(int rc)
+{
+ return (rc == -EIO || rc == -EROFS || rc == -ENOMEM ||
+ rc == -EAGAIN || rc == -EINPROGRESS);
+}
+
+static inline unsigned long rpcs_in_flight(struct client_obd *cli)
+{
+ return cli->cl_r_in_flight + cli->cl_w_in_flight;
+}
+
+struct osc_device {
+ struct cl_device od_cl;
+ struct obd_export *od_exp;
+
+ /* Write stats are actually protected by client_obd's lock. */
+ struct osc_stats {
+ uint64_t os_lockless_writes; /* in bytes */
+ uint64_t os_lockless_reads; /* in bytes */
+ uint64_t os_lockless_truncates; /* number of truncates */
+ } od_stats;
+
+ /* configuration item(s) */
+ int od_contention_time;
+ int od_lockless_truncate;
+};
+
+static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
+{
+ return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm);
+
+extern struct kmem_cache *osc_quota_kmem;
+struct osc_quota_info {
+ /** linkage for quota hash table */
+ struct hlist_node oqi_hash;
+ u32 oqi_id;
+};
+int osc_quota_setup(struct obd_device *obd);
+int osc_quota_cleanup(struct obd_device *obd);
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+ u32 valid, u32 flags);
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl);
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl);
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+#endif /* OSC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
new file mode 100644
index 000000000..3c7300b06
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_io.c
@@ -0,0 +1,819 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for OSC layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct osc_req *cl2osc_req(const struct cl_req_slice *slice)
+{
+ LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type);
+ return container_of0(slice, struct osc_req, or_cl);
+}
+
+static struct osc_io *cl2osc_io(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl);
+
+ LINVRNT(oio == osc_env_io(env));
+ return oio;
+}
+
+static struct osc_page *osc_cl_page_osc(struct cl_page *page)
+{
+ const struct cl_page_slice *slice;
+
+ slice = cl_page_at(page, &osc_device_type);
+ LASSERT(slice != NULL);
+
+ return cl2osc_page(slice);
+}
+
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
+{
+}
+
+/**
+ * An implementation of cl_io_operations::cio_io_submit() method for the osc
+ * layer. Iterates over pages in the in-queue, prepares each for io by calling
+ * cl_page_prep(), submits it through osc_page_submit() and queues batches of
+ * up to max_pages_per_rpc pages via osc_queue_sync_pages().
+ */
+static int osc_io_submit(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ enum cl_req_type crt, struct cl_2queue *queue)
+{
+ struct cl_page *page;
+ struct cl_page *tmp;
+ struct client_obd *cli = NULL;
+ struct osc_object *osc = NULL; /* to keep gcc happy */
+ struct osc_page *opg;
+ struct cl_io *io;
+ LIST_HEAD(list);
+
+ struct cl_page_list *qin = &queue->c2_qin;
+ struct cl_page_list *qout = &queue->c2_qout;
+ int queued = 0;
+ int result = 0;
+ int cmd;
+ int brw_flags;
+ int max_pages;
+
+ LASSERT(qin->pl_nr > 0);
+
+ CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+
+ osc = cl2osc(ios->cis_obj);
+ cli = osc_cli(osc);
+ max_pages = cli->cl_max_pages_per_rpc;
+
+ cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+ brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+
+ /*
+ * NOTE: here @page is a top-level page. This is done to avoid
+ * creation of sub-page-list.
+ */
+ cl_page_list_for_each_safe(page, tmp, qin) {
+ struct osc_async_page *oap;
+
+ /* Top level IO. */
+ io = page->cp_owner;
+ LASSERT(io != NULL);
+
+ opg = osc_cl_page_osc(page);
+ oap = &opg->ops_oap;
+ LASSERT(osc == oap->oap_obj);
+
+ if (!list_empty(&oap->oap_pending_item) ||
+ !list_empty(&oap->oap_rpc_item)) {
+ CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+ oap, opg);
+ result = -EBUSY;
+ break;
+ }
+
+ result = cl_page_prep(env, io, page, crt);
+ if (result != 0) {
+ LASSERT(result < 0);
+ if (result != -EALREADY)
+ break;
+ /*
+ * Handle -EALREADY error: for read case, the page is
+ * already in UPTODATE state; for write, the page
+ * is not dirty.
+ */
+ result = 0;
+ continue;
+ }
+
+ cl_page_list_move(qout, qin, page);
+ oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY;
+ oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+
+ osc_page_submit(env, opg, crt, brw_flags);
+ list_add_tail(&oap->oap_pending_item, &list);
+ if (++queued == max_pages) {
+ queued = 0;
+ result = osc_queue_sync_pages(env, osc, &list, cmd,
+ brw_flags);
+ if (result < 0)
+ break;
+ }
+ }
+
+ if (queued > 0)
+ result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+
+ CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
+ return qout->pl_nr > 0 ? 0 : result;
+}
+
+static void osc_page_touch_at(const struct lu_env *env,
+ struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+ struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ int valid;
+ __u64 kms;
+
+ /* offset within stripe */
+ kms = cl_offset(obj, idx) + to;
+
+ cl_object_attr_lock(obj);
+ /*
+ * XXX old code used
+ *
+ * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+ *
+ * here
+ */
+ CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n",
+ kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
+ loi->loi_lvb.lvb_size);
+
+ valid = 0;
+ if (kms > loi->loi_kms) {
+ attr->cat_kms = kms;
+ valid |= CAT_KMS;
+ }
+ if (kms > loi->loi_lvb.lvb_size) {
+ attr->cat_size = kms;
+ valid |= CAT_SIZE;
+ }
+ cl_object_attr_set(env, obj, attr, valid);
+ cl_object_attr_unlock(obj);
+}
+
+/**
+ * This is called when a page is accessed within a file in a way that creates
+ * a new page, if one was missing (i.e., if there was a hole at that place in
+ * the file, or the accessed page is beyond the current file size). Examples:
+ * ->commit_write() and ->nopage() methods.
+ *
+ * Expand stripe KMS if necessary.
+ */
+static void osc_page_touch(const struct lu_env *env,
+ struct osc_page *opage, unsigned to)
+{
+ struct cl_page *page = opage->ops_cl.cpl_page;
+ struct cl_object *obj = opage->ops_cl.cpl_obj;
+
+ osc_page_touch_at(env, obj, page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0 transfer initiated against this osc will most likely succeed.
+ *
+ * The reason for this check is to immediately return an error to the caller
+ * in the case of a deactivated import. Note that the import can be deactivated
+ * later, while pages dirtied by this IO are still in the cache, but this is
+ * irrelevant, because that would still return an error to the application (if
+ * it does fsync), but many applications don't do fsync because of performance
+ * issues, and we wanted to return an -EIO at write time to notify the
+ * application.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice,
+ unsigned from, unsigned to)
+{
+ struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+ struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+ struct osc_io *oio = cl2osc_io(env, ios);
+ int result = 0;
+
+ /*
+ * This implements OBD_BRW_CHECK logic from old client.
+ */
+
+ if (imp == NULL || imp->imp_invalid)
+ result = -EIO;
+ if (result == 0 && oio->oi_lockless)
+ /* this page contains `invalid' data, but who cares?
+ * nobody can access the invalid data.
+ * in osc_io_commit_write(), we're going to write exact
+ * [from, to) bytes of this page to OST. -jay */
+ cl_page_export(env, slice->cpl_page, 1);
+
+ return result;
+}
+
+static int osc_io_commit_write(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ const struct cl_page_slice *slice,
+ unsigned from, unsigned to)
+{
+ struct osc_io *oio = cl2osc_io(env, ios);
+ struct osc_page *opg = cl2osc_page(slice);
+ struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+ struct osc_async_page *oap = &opg->ops_oap;
+
+ LASSERT(to > 0);
+ /*
+ * XXX instead of calling osc_page_touch() here and in
+ * osc_io_fault_start() it might be more logical to introduce
+ * cl_page_touch() method, that generic cl_io_commit_write() and page
+ * fault code calls.
+ */
+ osc_page_touch(env, cl2osc_page(slice), to);
+ if (!client_is_remote(osc_export(obj)) &&
+ capable(CFS_CAP_SYS_RESOURCE))
+ oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+
+ if (oio->oi_lockless)
+ /* see osc_io_prepare_write() for lockless io handling. */
+ cl_page_clip(env, slice->cpl_page, from, to);
+
+ return 0;
+}
+
+static int osc_io_fault_start(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct cl_io *io;
+ struct cl_fault_io *fio;
+
+ io = ios->cis_io;
+ fio = &io->u.ci_fault;
+ CDEBUG(D_INFO, "%lu %d %d\n",
+ fio->ft_index, fio->ft_writable, fio->ft_nob);
+ /*
+ * If mapping is writeable, adjust kms to cover this page,
+ * but do not extend kms beyond actual file size.
+ * See bug 10919.
+ */
+ if (fio->ft_writable)
+ osc_page_touch_at(env, ios->cis_obj,
+ fio->ft_index, fio->ft_nob);
+ return 0;
+}
+
+static int osc_async_upcall(void *a, int rc)
+{
+ struct osc_async_cbargs *args = a;
+
+ args->opc_rc = rc;
+ complete(&args->opc_sync);
+ return 0;
+}
+
+/**
+ * Checks that there are no pages being written in the extent being truncated.
+ */
+static int trunc_check_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ const struct cl_page_slice *slice;
+ struct osc_page *ops;
+ struct osc_async_page *oap;
+ __u64 start = *(__u64 *)cbdata;
+
+ slice = cl_page_at(page, &osc_device_type);
+ LASSERT(slice != NULL);
+ ops = cl2osc_page(slice);
+ oap = &ops->ops_oap;
+
+ if (oap->oap_cmd & OBD_BRW_WRITE &&
+ !list_empty(&oap->oap_pending_item))
+ CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n",
+ start, current->comm);
+
+ {
+ struct page *vmpage = cl_page_vmpage(env, page);
+
+ if (PageLocked(vmpage))
+ CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
+ ops, page->cp_index,
+ (oap->oap_cmd & OBD_BRW_RWMASK));
+ }
+
+ return CLP_GANG_OKAY;
+}
+
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+ struct osc_io *oio, __u64 size)
+{
+ struct cl_object *clob;
+ int partial;
+ pgoff_t start;
+
+ clob = oio->oi_cl.cis_obj;
+ start = cl_index(clob, size);
+ partial = cl_offset(clob, start) < size;
+
+ /*
+ * Complain if there are pages in the truncated region.
+ */
+ cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF,
+ trunc_check_cb, (void *)&size);
+}
+
+static int osc_io_setattr_start(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct cl_io *io = slice->cis_io;
+ struct osc_io *oio = cl2osc_io(env, slice);
+ struct cl_object *obj = slice->cis_obj;
+ struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ struct obdo *oa = &oio->oi_oa;
+ struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+ __u64 size = io->u.ci_setattr.sa_attr.lvb_size;
+ unsigned int ia_valid = io->u.ci_setattr.sa_valid;
+ int result = 0;
+ struct obd_info oinfo = { { { 0 } } };
+
+ /* truncate cache dirty pages first */
+ if (cl_io_is_trunc(io))
+ result = osc_cache_truncate_start(env, oio, cl2osc(obj), size);
+
+ if (result == 0 && oio->oi_lockless == 0) {
+ cl_object_attr_lock(obj);
+ result = cl_object_attr_get(env, obj, attr);
+ if (result == 0) {
+ struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
+ unsigned int cl_valid = 0;
+
+ if (ia_valid & ATTR_SIZE) {
+ attr->cat_size = attr->cat_kms = size;
+ cl_valid = CAT_SIZE | CAT_KMS;
+ }
+ if (ia_valid & ATTR_MTIME_SET) {
+ attr->cat_mtime = lvb->lvb_mtime;
+ cl_valid |= CAT_MTIME;
+ }
+ if (ia_valid & ATTR_ATIME_SET) {
+ attr->cat_atime = lvb->lvb_atime;
+ cl_valid |= CAT_ATIME;
+ }
+ if (ia_valid & ATTR_CTIME_SET) {
+ attr->cat_ctime = lvb->lvb_ctime;
+ cl_valid |= CAT_CTIME;
+ }
+ result = cl_object_attr_set(env, obj, attr, cl_valid);
+ }
+ cl_object_attr_unlock(obj);
+ }
+ memset(oa, 0, sizeof(*oa));
+ if (result == 0) {
+ oa->o_oi = loi->loi_oi;
+ oa->o_mtime = attr->cat_mtime;
+ oa->o_atime = attr->cat_atime;
+ oa->o_ctime = attr->cat_ctime;
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
+ OBD_MD_FLCTIME | OBD_MD_FLMTIME;
+ if (ia_valid & ATTR_SIZE) {
+ oa->o_size = size;
+ oa->o_blocks = OBD_OBJECT_EOF;
+ oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+ if (oio->oi_lockless) {
+ oa->o_flags = OBD_FL_SRVLOCK;
+ oa->o_valid |= OBD_MD_FLFLAGS;
+ }
+ } else {
+ LASSERT(oio->oi_lockless == 0);
+ }
+
+ oinfo.oi_oa = oa;
+ oinfo.oi_capa = io->u.ci_setattr.sa_capa;
+ init_completion(&cbargs->opc_sync);
+
+ if (ia_valid & ATTR_SIZE)
+ result = osc_punch_base(osc_export(cl2osc(obj)),
+ &oinfo, osc_async_upcall,
+ cbargs, PTLRPCD_SET);
+ else
+ result = osc_setattr_async_base(osc_export(cl2osc(obj)),
+ &oinfo, NULL,
+ osc_async_upcall,
+ cbargs, PTLRPCD_SET);
+ cbargs->opc_rpc_sent = result == 0;
+ }
+ return result;
+}
+
+static void osc_io_setattr_end(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct cl_io *io = slice->cis_io;
+ struct osc_io *oio = cl2osc_io(env, slice);
+ struct cl_object *obj = slice->cis_obj;
+ struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+ int result = 0;
+
+ if (cbargs->opc_rpc_sent) {
+ wait_for_completion(&cbargs->opc_sync);
+ result = io->ci_result = cbargs->opc_rc;
+ }
+ if (result == 0) {
+ if (oio->oi_lockless) {
+ /* lockless truncate */
+ struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+ LASSERT(cl_io_is_trunc(io));
+ /* XXX: Need a lock. */
+ osd->od_stats.os_lockless_truncates++;
+ }
+ }
+
+ if (cl_io_is_trunc(io)) {
+ __u64 size = io->u.ci_setattr.sa_attr.lvb_size;
+
+ osc_trunc_check(env, io, oio, size);
+ if (oio->oi_trunc != NULL) {
+ osc_cache_truncate_end(env, oio, cl2osc(obj));
+ oio->oi_trunc = NULL;
+ }
+ }
+}
+
+static int osc_io_read_start(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct cl_object *obj = slice->cis_obj;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ int rc = 0;
+
+ if (!slice->cis_io->ci_noatime) {
+ cl_object_attr_lock(obj);
+ attr->cat_atime = LTIME_S(CURRENT_TIME);
+ rc = cl_object_attr_set(env, obj, attr, CAT_ATIME);
+ cl_object_attr_unlock(obj);
+ }
+ return rc;
+}
+
+static int osc_io_write_start(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct cl_object *obj = slice->cis_obj;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ int rc = 0;
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
+ cl_object_attr_lock(obj);
+ attr->cat_mtime = attr->cat_ctime = LTIME_S(CURRENT_TIME);
+ rc = cl_object_attr_set(env, obj, attr, CAT_MTIME | CAT_CTIME);
+ cl_object_attr_unlock(obj);
+
+ return rc;
+}
+
+static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+ struct cl_fsync_io *fio)
+{
+ struct osc_io *oio = osc_env_io(env);
+ struct obdo *oa = &oio->oi_oa;
+ struct obd_info *oinfo = &oio->oi_info;
+ struct lov_oinfo *loi = obj->oo_oinfo;
+ struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+ int rc = 0;
+
+ memset(oa, 0, sizeof(*oa));
+ oa->o_oi = loi->loi_oi;
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+ /* reload size and blocks for start and end of sync range */
+ oa->o_size = fio->fi_start;
+ oa->o_blocks = fio->fi_end;
+ oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+ obdo_set_parent_fid(oa, fio->fi_fid);
+
+ memset(oinfo, 0, sizeof(*oinfo));
+ oinfo->oi_oa = oa;
+ oinfo->oi_capa = fio->fi_capa;
+ init_completion(&cbargs->opc_sync);
+
+ rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs,
+ PTLRPCD_SET);
+ return rc;
+}
+
+static int osc_io_fsync_start(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct cl_io *io = slice->cis_io;
+ struct cl_fsync_io *fio = &io->u.ci_fsync;
+ struct cl_object *obj = slice->cis_obj;
+ struct osc_object *osc = cl2osc(obj);
+ pgoff_t start = cl_index(obj, fio->fi_start);
+ pgoff_t end = cl_index(obj, fio->fi_end);
+ int result = 0;
+
+ if (fio->fi_end == OBD_OBJECT_EOF)
+ end = CL_PAGE_EOF;
+
+ result = osc_cache_writeback_range(env, osc, start, end, 0,
+ fio->fi_mode == CL_FSYNC_DISCARD);
+ if (result > 0) {
+ fio->fi_nr_written += result;
+ result = 0;
+ }
+ if (fio->fi_mode == CL_FSYNC_ALL) {
+ int rc;
+
+ /* we have to wait for writeback to finish before we can
+ * send OST_SYNC RPC. This is bad because it causes extents
+ * to be written osc by osc. However, we usually start
+ * writeback before CL_FSYNC_ALL, so in practice this is not a real
+ * problem. */
+ rc = osc_cache_wait_range(env, osc, start, end);
+ if (result == 0)
+ result = rc;
+ rc = osc_fsync_ost(env, osc, fio);
+ if (result == 0)
+ result = rc;
+ }
+
+ return result;
+}
+
+static void osc_io_fsync_end(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync;
+ struct cl_object *obj = slice->cis_obj;
+ pgoff_t start = cl_index(obj, fio->fi_start);
+ pgoff_t end = cl_index(obj, fio->fi_end);
+ int result = 0;
+
+ if (fio->fi_mode == CL_FSYNC_LOCAL) {
+ result = osc_cache_wait_range(env, cl2osc(obj), start, end);
+ } else if (fio->fi_mode == CL_FSYNC_ALL) {
+ struct osc_io *oio = cl2osc_io(env, slice);
+ struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+
+ wait_for_completion(&cbargs->opc_sync);
+ if (result == 0)
+ result = cbargs->opc_rc;
+ }
+ slice->cis_io->ci_result = result;
+}
+
+static void osc_io_end(const struct lu_env *env,
+ const struct cl_io_slice *slice)
+{
+ struct osc_io *oio = cl2osc_io(env, slice);
+
+ if (oio->oi_active) {
+ osc_extent_release(env, oio->oi_active);
+ oio->oi_active = NULL;
+ }
+}
+
+static const struct cl_io_operations osc_io_ops = {
+ .op = {
+ [CIT_READ] = {
+ .cio_start = osc_io_read_start,
+ .cio_fini = osc_io_fini
+ },
+ [CIT_WRITE] = {
+ .cio_start = osc_io_write_start,
+ .cio_end = osc_io_end,
+ .cio_fini = osc_io_fini
+ },
+ [CIT_SETATTR] = {
+ .cio_start = osc_io_setattr_start,
+ .cio_end = osc_io_setattr_end
+ },
+ [CIT_FAULT] = {
+ .cio_start = osc_io_fault_start,
+ .cio_end = osc_io_end,
+ .cio_fini = osc_io_fini
+ },
+ [CIT_FSYNC] = {
+ .cio_start = osc_io_fsync_start,
+ .cio_end = osc_io_fsync_end,
+ .cio_fini = osc_io_fini
+ },
+ [CIT_MISC] = {
+ .cio_fini = osc_io_fini
+ }
+ },
+ .req_op = {
+ [CRT_READ] = {
+ .cio_submit = osc_io_submit
+ },
+ [CRT_WRITE] = {
+ .cio_submit = osc_io_submit
+ }
+ },
+ .cio_prepare_write = osc_io_prepare_write,
+ .cio_commit_write = osc_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+static int osc_req_prep(const struct lu_env *env,
+ const struct cl_req_slice *slice)
+{
+ return 0;
+}
+
+static void osc_req_completion(const struct lu_env *env,
+ const struct cl_req_slice *slice, int ioret)
+{
+ struct osc_req *or;
+
+ or = cl2osc_req(slice);
+ OBD_SLAB_FREE_PTR(or, osc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for osc
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
+ * fields.
+ */
+static void osc_req_attr_set(const struct lu_env *env,
+ const struct cl_req_slice *slice,
+ const struct cl_object *obj,
+ struct cl_req_attr *attr, u64 flags)
+{
+ struct lov_oinfo *oinfo;
+ struct cl_req *clerq;
+ struct cl_page *apage; /* _some_ page in @clerq */
+ struct cl_lock *lock; /* _some_ lock protecting @apage */
+ struct osc_lock *olck;
+ struct osc_page *opg;
+ struct obdo *oa;
+ struct ost_lvb *lvb;
+
+ oinfo = cl2osc(obj)->oo_oinfo;
+ lvb = &oinfo->loi_lvb;
+ oa = attr->cra_oa;
+
+ if ((flags & OBD_MD_FLMTIME) != 0) {
+ oa->o_mtime = lvb->lvb_mtime;
+ oa->o_valid |= OBD_MD_FLMTIME;
+ }
+ if ((flags & OBD_MD_FLATIME) != 0) {
+ oa->o_atime = lvb->lvb_atime;
+ oa->o_valid |= OBD_MD_FLATIME;
+ }
+ if ((flags & OBD_MD_FLCTIME) != 0) {
+ oa->o_ctime = lvb->lvb_ctime;
+ oa->o_valid |= OBD_MD_FLCTIME;
+ }
+ if (flags & OBD_MD_FLGROUP) {
+ ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
+ oa->o_valid |= OBD_MD_FLGROUP;
+ }
+ if (flags & OBD_MD_FLID) {
+ ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
+ oa->o_valid |= OBD_MD_FLID;
+ }
+ if (flags & OBD_MD_FLHANDLE) {
+ clerq = slice->crs_req;
+ LASSERT(!list_empty(&clerq->crq_pages));
+ apage = container_of(clerq->crq_pages.next,
+ struct cl_page, cp_flight);
+ opg = osc_cl_page_osc(apage);
+ apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
+ lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
+ if (lock == NULL) {
+ struct cl_object_header *head;
+ struct cl_lock *scan;
+
+ head = cl_object_header(apage->cp_obj);
+ list_for_each_entry(scan, &head->coh_locks,
+ cll_linkage)
+ CL_LOCK_DEBUG(D_ERROR, env, scan,
+ "no cover page!\n");
+ CL_PAGE_DEBUG(D_ERROR, env, apage,
+ "dump uncover page!\n");
+ dump_stack();
+ LBUG();
+ }
+
+ olck = osc_lock_at(lock);
+ LASSERT(olck != NULL);
+ LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL));
+ /* check for lockless io. */
+ if (olck->ols_lock != NULL) {
+ oa->o_handle = olck->ols_lock->l_remote_handle;
+ oa->o_valid |= OBD_MD_FLHANDLE;
+ }
+ cl_lock_put(env, lock);
+ }
+}
+
+static const struct cl_req_operations osc_req_ops = {
+ .cro_prep = osc_req_prep,
+ .cro_attr_set = osc_req_attr_set,
+ .cro_completion = osc_req_completion
+};
+
+
+int osc_io_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io)
+{
+ struct osc_io *oio = osc_env_io(env);
+
+ CL_IO_SLICE_CLEAN(oio, oi_cl);
+ cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+ return 0;
+}
+
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+ struct cl_req *req)
+{
+ struct osc_req *or;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, GFP_NOFS);
+ if (or != NULL) {
+ cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops);
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c
new file mode 100644
index 000000000..350ad4955
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_lock.c
@@ -0,0 +1,1613 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for OSC layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "../../include/linux/libcfs/libcfs.h"
+/* fid_build_reg_res_name() */
+#include "../include/lustre_fid.h"
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+#define _PAGEREF_MAGIC (-10000000)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static const struct cl_lock_operations osc_lock_ops;
+static const struct cl_lock_operations osc_lock_lockless_ops;
+static void osc_lock_to_lockless(const struct lu_env *env,
+ struct osc_lock *ols, int force);
+static int osc_lock_has_pages(struct osc_lock *olck);
+
+int osc_lock_is_lockless(const struct osc_lock *olck)
+{
+ return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops);
+}
+
+/**
+ * Returns a weak pointer to the ldlm lock identified by a handle. Returned
+ * pointer cannot be dereferenced, as lock is not protected from concurrent
+ * reclaim. This function is a helper for osc_lock_invariant().
+ */
+static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
+{
+ struct ldlm_lock *lock;
+
+ lock = ldlm_handle2lock(handle);
+ if (lock != NULL)
+ LDLM_LOCK_PUT(lock);
+ return lock;
+}
+
+/**
+ * Invariant that has to be true all of the time.
+ */
+static int osc_lock_invariant(struct osc_lock *ols)
+{
+ struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle);
+ struct ldlm_lock *olock = ols->ols_lock;
+ int handle_used = lustre_handle_is_used(&ols->ols_handle);
+
+ if (ergo(osc_lock_is_lockless(ols),
+ ols->ols_locklessable && ols->ols_lock == NULL))
+ return 1;
+
+ /*
+ * If all the following "ergo"s are true, return 1, otherwise 0
+ */
+ if (!ergo(olock != NULL, handle_used))
+ return 0;
+
+ if (!ergo(olock != NULL,
+ olock->l_handle.h_cookie == ols->ols_handle.cookie))
+ return 0;
+
+ if (!ergo(handle_used,
+ ergo(lock != NULL && olock != NULL, lock == olock) &&
+ ergo(lock == NULL, olock == NULL)))
+ return 0;
+ /*
+ * Check that ->ols_handle and ->ols_lock are consistent, but
+ * take into account that they are set at different times.
+ */
+ if (!ergo(ols->ols_state == OLS_CANCELLED,
+ olock == NULL && !handle_used))
+ return 0;
+ /*
+ * DLM lock is destroyed only after we have seen cancellation
+ * ast.
+ */
+ if (!ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
+ ((olock->l_flags & LDLM_FL_DESTROYED) == 0)))
+ return 0;
+
+ if (!ergo(ols->ols_state == OLS_GRANTED,
+ olock != NULL &&
+ olock->l_req_mode == olock->l_granted_mode &&
+ ols->ols_hold))
+ return 0;
+ return 1;
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Breaks a link between osc_lock and dlm_lock.
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+ struct ldlm_lock *dlmlock;
+
+ spin_lock(&osc_ast_guard);
+ dlmlock = olck->ols_lock;
+ if (dlmlock == NULL) {
+ spin_unlock(&osc_ast_guard);
+ return;
+ }
+
+ olck->ols_lock = NULL;
+ /* wb(); --- for all who check (ols->ols_lock != NULL) before
+ * calling osc_lock_detach() */
+ dlmlock->l_ast_data = NULL;
+ olck->ols_handle.cookie = 0ULL;
+ spin_unlock(&osc_ast_guard);
+
+ lock_res_and_lock(dlmlock);
+ if (dlmlock->l_granted_mode == dlmlock->l_req_mode) {
+ struct cl_object *obj = olck->ols_cl.cls_obj;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ __u64 old_kms;
+
+ cl_object_attr_lock(obj);
+ /* Must get the value under the lock to avoid possible races. */
+ old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
+ /* Update the kms. Need to loop over all granted locks;
+ * not a problem for the client. */
+ attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
+
+ cl_object_attr_set(env, obj, attr, CAT_KMS);
+ cl_object_attr_unlock(obj);
+ }
+ unlock_res_and_lock(dlmlock);
+
+ /* release a reference taken in osc_lock_upcall0(). */
+ LASSERT(olck->ols_has_ref);
+ lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+ LDLM_LOCK_RELEASE(dlmlock);
+ olck->ols_has_ref = 0;
+}
+
+static int osc_lock_unhold(struct osc_lock *ols)
+{
+ int result = 0;
+
+ if (ols->ols_hold) {
+ ols->ols_hold = 0;
+ result = osc_cancel_base(&ols->ols_handle,
+ ols->ols_einfo.ei_mode);
+ }
+ return result;
+}
+
+static int osc_lock_unuse(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+
+ LINVRNT(osc_lock_invariant(ols));
+
+ switch (ols->ols_state) {
+ case OLS_NEW:
+ LASSERT(!ols->ols_hold);
+ LASSERT(ols->ols_agl);
+ return 0;
+ case OLS_UPCALL_RECEIVED:
+ osc_lock_unhold(ols);
+ case OLS_ENQUEUED:
+ LASSERT(!ols->ols_hold);
+ osc_lock_detach(env, ols);
+ ols->ols_state = OLS_NEW;
+ return 0;
+ case OLS_GRANTED:
+ LASSERT(!ols->ols_glimpse);
+ LASSERT(ols->ols_hold);
+ /*
+ * Move lock into OLS_RELEASED state before calling
+ * osc_cancel_base() so that possible synchronous cancellation
+ * (that always happens e.g., for liblustre) sees that lock is
+ * released.
+ */
+ ols->ols_state = OLS_RELEASED;
+ return osc_lock_unhold(ols);
+ default:
+ CERROR("Impossible state: %d\n", ols->ols_state);
+ LBUG();
+ }
+}
+
+static void osc_lock_fini(const struct lu_env *env,
+ struct cl_lock_slice *slice)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+
+ LINVRNT(osc_lock_invariant(ols));
+ /*
+ * ->ols_hold can still be true at this point if, for example, a
+ * thread that requested a lock was killed (and released a reference
+ * to the lock), before reply from a server was received. In this case
+ * lock is destroyed immediately after upcall.
+ */
+ osc_lock_unhold(ols);
+ LASSERT(ols->ols_lock == NULL);
+ LASSERT(atomic_read(&ols->ols_pageref) == 0 ||
+ atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC);
+
+ OBD_SLAB_FREE_PTR(ols, osc_lock_kmem);
+}
+
+static void osc_lock_build_policy(const struct lu_env *env,
+ const struct cl_lock *lock,
+ ldlm_policy_data_t *policy)
+{
+ const struct cl_lock_descr *d = &lock->cll_descr;
+
+ osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end);
+ policy->l_extent.gid = d->cld_gid;
+}
+
+static __u64 osc_enq2ldlm_flags(__u32 enqflags)
+{
+ __u64 result = 0;
+
+ LASSERT((enqflags & ~CEF_MASK) == 0);
+
+ if (enqflags & CEF_NONBLOCK)
+ result |= LDLM_FL_BLOCK_NOWAIT;
+ if (enqflags & CEF_ASYNC)
+ result |= LDLM_FL_HAS_INTENT;
+ if (enqflags & CEF_DISCARD_DATA)
+ result |= LDLM_FL_AST_DISCARD_DATA;
+ return result;
+}
+
+/**
+ * Global spin-lock protecting consistency of ldlm_lock::l_ast_data
+ * pointers. Initialized in osc_init().
+ */
+spinlock_t osc_ast_guard;
+
+static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock)
+{
+ struct osc_lock *olck;
+
+ lock_res_and_lock(dlm_lock);
+ spin_lock(&osc_ast_guard);
+ olck = dlm_lock->l_ast_data;
+ if (olck != NULL) {
+ struct cl_lock *lock = olck->ols_cl.cls_lock;
+ /*
+ * If osc_lock holds a reference on ldlm lock, return it even
+ * when cl_lock is in CLS_FREEING state. This way
+ *
+ * osc_ast_data_get(dlmlock) == NULL
+ *
+ * guarantees that all osc references on dlmlock were
+ * released. osc_dlm_blocking_ast0() relies on that.
+ */
+ if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) {
+ cl_lock_get_trust(lock);
+ lu_ref_add_atomic(&lock->cll_reference,
+ "ast", current);
+ } else
+ olck = NULL;
+ }
+ spin_unlock(&osc_ast_guard);
+ unlock_res_and_lock(dlm_lock);
+ return olck;
+}
+
+static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
+{
+ struct cl_lock *lock;
+
+ lock = olck->ols_cl.cls_lock;
+ lu_ref_del(&lock->cll_reference, "ast", current);
+ cl_lock_put(env, lock);
+}
+
+/**
+ * Updates object attributes from a lock value block (lvb) received together
+ * with the DLM lock reply from the server. Copy of osc_update_enqueue()
+ * logic.
+ *
+ * This can be optimized to not update attributes when lock is a result of a
+ * local match.
+ *
+ * Called under lock and resource spin-locks.
+ */
+static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
+ int rc)
+{
+ struct ost_lvb *lvb;
+ struct cl_object *obj;
+ struct lov_oinfo *oinfo;
+ struct cl_attr *attr;
+ unsigned valid;
+
+ if (!(olck->ols_flags & LDLM_FL_LVB_READY))
+ return;
+
+ lvb = &olck->ols_lvb;
+ obj = olck->ols_cl.cls_obj;
+ oinfo = cl2osc(obj)->oo_oinfo;
+ attr = &osc_env_info(env)->oti_attr;
+ valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
+ cl_lvb2attr(attr, lvb);
+
+ cl_object_attr_lock(obj);
+ if (rc == 0) {
+ struct ldlm_lock *dlmlock;
+ __u64 size;
+
+ dlmlock = olck->ols_lock;
+ LASSERT(dlmlock != NULL);
+
+ /* re-grab LVB from a dlm lock under DLM spin-locks. */
+ *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+ size = lvb->lvb_size;
+ /* Extend KMS up to the end of this lock and no further
+ * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+ if (size > dlmlock->l_policy_data.l_extent.end)
+ size = dlmlock->l_policy_data.l_extent.end + 1;
+ if (size >= oinfo->loi_kms) {
+ LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu, kms=%llu",
+ lvb->lvb_size, size);
+ valid |= CAT_KMS;
+ attr->cat_kms = size;
+ } else {
+ LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu; leaving kms=%llu, end=%llu",
+ lvb->lvb_size, oinfo->loi_kms,
+ dlmlock->l_policy_data.l_extent.end);
+ }
+ ldlm_lock_allow_match_locked(dlmlock);
+ } else if (rc == -ENAVAIL && olck->ols_glimpse) {
+ CDEBUG(D_INODE, "glimpsed, setting rss=%llu; leaving kms=%llu\n",
+ lvb->lvb_size, oinfo->loi_kms);
+ } else
+ valid = 0;
+
+ if (valid != 0)
+ cl_object_attr_set(env, obj, attr, valid);
+
+ cl_object_attr_unlock(obj);
+}
+
+/**
+ * Called when a lock is granted, from an upcall (when server returned a
+ * granted lock), or from completion AST, when server returned a blocked lock.
+ *
+ * Called under lock and resource spin-locks, that are released temporarily
+ * here.
+ */
+static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
+ struct ldlm_lock *dlmlock, int rc)
+{
+ struct ldlm_extent *ext;
+ struct cl_lock *lock;
+ struct cl_lock_descr *descr;
+
+ LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
+
+ if (olck->ols_state < OLS_GRANTED) {
+ lock = olck->ols_cl.cls_lock;
+ ext = &dlmlock->l_policy_data.l_extent;
+ descr = &osc_env_info(env)->oti_descr;
+ descr->cld_obj = lock->cll_descr.cld_obj;
+
+ /* XXX check that ->l_granted_mode is valid. */
+ descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
+ descr->cld_start = cl_index(descr->cld_obj, ext->start);
+ descr->cld_end = cl_index(descr->cld_obj, ext->end);
+ descr->cld_gid = ext->gid;
+ /*
+ * tell upper layers the extent of the lock that was actually
+ * granted
+ */
+ olck->ols_state = OLS_GRANTED;
+ osc_lock_lvb_update(env, olck, rc);
+
+ /* release DLM spin-locks to allow cl_lock_{modify,signal}()
+ * to take a semaphore on a parent lock. This is safe, because
+ * spin-locks are needed to protect consistency of
+ * dlmlock->l_*_mode and LVB, and we have finished processing
+ * them. */
+ unlock_res_and_lock(dlmlock);
+ cl_lock_modify(env, lock, descr);
+ cl_lock_signal(env, lock);
+ LINVRNT(osc_lock_invariant(olck));
+ lock_res_and_lock(dlmlock);
+ }
+}
+
+static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
+
+{
+ struct ldlm_lock *dlmlock;
+
+ dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0);
+ LASSERT(dlmlock != NULL);
+
+ lock_res_and_lock(dlmlock);
+ spin_lock(&osc_ast_guard);
+ LASSERT(dlmlock->l_ast_data == olck);
+ LASSERT(olck->ols_lock == NULL);
+ olck->ols_lock = dlmlock;
+ spin_unlock(&osc_ast_guard);
+
+ /*
+ * Lock might not be granted yet. In this case, the completion ast
+ * (osc_ldlm_completion_ast()) comes later and finishes granting
+ * the lock.
+ */
+ if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
+ osc_lock_granted(env, olck, dlmlock, 0);
+ unlock_res_and_lock(dlmlock);
+
+ /*
+ * osc_enqueue_interpret() decrefs asynchronous locks; counter
+ * that here.
+ */
+ ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+ olck->ols_hold = 1;
+
+ /* lock reference taken by ldlm_handle2lock_long() is owned by
+ * osc_lock and released in osc_lock_detach() */
+ lu_ref_add(&dlmlock->l_reference, "osc_lock", olck);
+ olck->ols_has_ref = 1;
+}
+
+/**
+ * Lock upcall function that is executed either when a reply to ENQUEUE rpc is
+ * received from a server, or after osc_enqueue_base() matched a local DLM
+ * lock.
+ */
+static int osc_lock_upcall(void *cookie, int errcode)
+{
+ struct osc_lock *olck = cookie;
+ struct cl_lock_slice *slice = &olck->ols_cl;
+ struct cl_lock *lock = slice->cls_lock;
+ struct lu_env *env;
+ struct cl_env_nest nest;
+
+ env = cl_env_nested_get(&nest);
+ if (!IS_ERR(env)) {
+ int rc;
+
+ cl_lock_mutex_get(env, lock);
+
+ LASSERT(lock->cll_state >= CLS_QUEUING);
+ if (olck->ols_state == OLS_ENQUEUED) {
+ olck->ols_state = OLS_UPCALL_RECEIVED;
+ rc = ldlm_error2errno(errcode);
+ } else if (olck->ols_state == OLS_CANCELLED) {
+ rc = -EIO;
+ } else {
+ CERROR("Impossible state: %d\n", olck->ols_state);
+ LBUG();
+ }
+ if (rc) {
+ struct ldlm_lock *dlmlock;
+
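+			/* The enqueue failed (or the lock was cancelled before
+			 * the upcall): unbind the DLM lock from this osc_lock,
+			 * clear the handle, and fail any waiting matches on
+			 * the DLM lock. */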
+ dlmlock = ldlm_handle2lock(&olck->ols_handle);
+ if (dlmlock != NULL) {
+ lock_res_and_lock(dlmlock);
+ spin_lock(&osc_ast_guard);
+ LASSERT(olck->ols_lock == NULL);
+ dlmlock->l_ast_data = NULL;
+ olck->ols_handle.cookie = 0ULL;
+ spin_unlock(&osc_ast_guard);
+ ldlm_lock_fail_match_locked(dlmlock);
+ unlock_res_and_lock(dlmlock);
+ LDLM_LOCK_PUT(dlmlock);
+ }
+ } else {
+ if (olck->ols_glimpse)
+ olck->ols_glimpse = 0;
+ osc_lock_upcall0(env, olck);
+ }
+
+ /* Error handling, some errors are tolerable. */
+ if (olck->ols_locklessable && rc == -EUSERS) {
+			/* This is a tolerable error; turn this lock into a
+			 * lockless lock.
+ */
+ osc_object_set_contended(cl2osc(slice->cls_obj));
+ LASSERT(slice->cls_ops == &osc_lock_ops);
+
+			/* Change this lock into a lockless lock (not backed by an ldlm lock). */
+ osc_lock_to_lockless(env, olck, 1);
+ olck->ols_state = OLS_GRANTED;
+ rc = 0;
+ } else if (olck->ols_glimpse && rc == -ENAVAIL) {
+ osc_lock_lvb_update(env, olck, rc);
+ cl_lock_delete(env, lock);
+ /* Hide the error. */
+ rc = 0;
+ }
+
+ if (rc == 0) {
+			/* In the AGL case, the RPC sponsor may exit the cl_lock
+			 * processing without calling wait() before the related OSC
+			 * lock upcall. So update the lock status according
+			 * to the enqueue result inside the AGL upcall. */
+ if (olck->ols_agl) {
+ lock->cll_flags |= CLF_FROM_UPCALL;
+ cl_wait_try(env, lock);
+ lock->cll_flags &= ~CLF_FROM_UPCALL;
+ if (!olck->ols_glimpse)
+ olck->ols_agl = 0;
+ }
+ cl_lock_signal(env, lock);
+ /* del user for lock upcall cookie */
+ cl_unuse_try(env, lock);
+ } else {
+ /* del user for lock upcall cookie */
+ cl_lock_user_del(env, lock);
+ cl_lock_error(env, lock, rc);
+ }
+
+ /* release cookie reference, acquired by osc_lock_enqueue() */
+ cl_lock_hold_release(env, lock, "upcall", lock);
+ cl_lock_mutex_put(env, lock);
+
+ lu_ref_del(&lock->cll_reference, "upcall", lock);
+		/* This may be the last reference, so cl_lock_put() must be
+		 * called after cl_lock_mutex_put(). */
+ cl_lock_put(env, lock);
+
+ cl_env_nested_put(&nest, env);
+ } else {
+ /* should never happen, similar to osc_ldlm_blocking_ast(). */
+ LBUG();
+ }
+ return errcode;
+}
+
+/**
+ * Core of osc_dlm_blocking_ast() logic.
+ */
+static void osc_lock_blocking(const struct lu_env *env,
+ struct ldlm_lock *dlmlock,
+ struct osc_lock *olck, int blocking)
+{
+ struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+ LASSERT(olck->ols_lock == dlmlock);
+ CLASSERT(OLS_BLOCKED < OLS_CANCELLED);
+ LASSERT(!osc_lock_is_lockless(olck));
+
+ /*
+	 * Lock might still be addref-ed here if, e.g., a blocking AST
+	 * is sent for a failed lock.
+ */
+ osc_lock_unhold(olck);
+
+ if (blocking && olck->ols_state < OLS_BLOCKED)
+ /*
+ * Move osc_lock into OLS_BLOCKED before canceling the lock,
+ * because it recursively re-enters osc_lock_blocking(), with
+ * the state set to OLS_CANCELLED.
+ */
+ olck->ols_state = OLS_BLOCKED;
+ /*
+	 * Cancel and destroy the lock at least once, no matter how the blocking
+	 * AST is entered (see the comment above osc_ldlm_blocking_ast() for use
+	 * cases). cl_lock_cancel() and cl_lock_delete() are idempotent.
+ */
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+}
+
+/**
+ * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock
+ * and ldlm_lock caches.
+ */
+static int osc_dlm_blocking_ast0(const struct lu_env *env,
+ struct ldlm_lock *dlmlock,
+ void *data, int flag)
+{
+ struct osc_lock *olck;
+ struct cl_lock *lock;
+ int result;
+ int cancel;
+
+ LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING);
+
+ cancel = 0;
+ olck = osc_ast_data_get(dlmlock);
+ if (olck != NULL) {
+ lock = olck->ols_cl.cls_lock;
+ cl_lock_mutex_get(env, lock);
+ LINVRNT(osc_lock_invariant(olck));
+ if (olck->ols_ast_wait) {
+ /* wake up osc_lock_use() */
+ cl_lock_signal(env, lock);
+ olck->ols_ast_wait = 0;
+ }
+ /*
+		 * Lock might have been canceled while this thread was
+		 * sleeping on the lock mutex, but olck is pinned in memory.
+ */
+ if (olck == dlmlock->l_ast_data) {
+ /*
+ * NOTE: DLM sends blocking AST's for failed locks
+ * (that are still in pre-OLS_GRANTED state)
+			 *    too, and they have to be canceled, otherwise
+			 *    the DLM lock is never destroyed and remains
+			 *    stuck in memory.
+ *
+ * Alternatively, ldlm_cli_cancel() can be
+ * called here directly for osc_locks with
+ * ols_state < OLS_GRANTED to maintain an
+ * invariant that ->clo_cancel() is only called
+ * for locks that were granted.
+ */
+ LASSERT(data == olck);
+ osc_lock_blocking(env, dlmlock,
+ olck, flag == LDLM_CB_BLOCKING);
+ } else
+ cancel = 1;
+ cl_lock_mutex_put(env, lock);
+ osc_ast_data_put(env, olck);
+ } else
+ /*
+ * DLM lock exists, but there is no cl_lock attached to it.
+		 * This is a `normal' race. A cl_object and its cl_locks can be
+		 * removed by memory pressure, together with all of their pages.
+ */
+ cancel = (flag == LDLM_CB_BLOCKING);
+
+ if (cancel) {
+ struct lustre_handle *lockh;
+
+ lockh = &osc_env_info(env)->oti_handle;
+ ldlm_lock2handle(dlmlock, lockh);
+ result = ldlm_cli_cancel(lockh, LCF_ASYNC);
+ } else
+ result = 0;
+ return result;
+}
+
+/**
+ * Blocking AST invoked by ldlm when a DLM lock is either blocking the progress
+ * of some other lock, or is being canceled. This function is installed as the
+ * ldlm_lock::l_blocking_ast() for client extent locks.
+ *
+ * Control flow is tricky, because ldlm uses the same call-back
+ * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's.
+ *
+ * \param dlmlock lock for which ast occurred.
+ *
+ * \param new description of a conflicting lock in case of blocking ast.
+ *
+ * \param data value of dlmlock->l_ast_data
+ *
+ * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish
+ * cancellation and blocking ast's.
+ *
+ * Possible use cases:
+ *
+ * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
+ * lock due to lock lru pressure, or explicit user request to purge
+ * locks.
+ *
+ * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify
+ * us that dlmlock conflicts with another lock that some client is
+ *       enqueuing. Lock is canceled.
+ *
+ * - cl_lock_cancel() is called. osc_lock_cancel() calls
+ * ldlm_cli_cancel() that calls
+ *
+ * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ * recursively entering osc_ldlm_blocking_ast().
+ *
+ *     - client cancels lock voluntarily (e.g., as part of early cancellation):
+ *
+ * cl_lock_cancel()->
+ * osc_lock_cancel()->
+ * ldlm_cli_cancel()->
+ * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+ struct ldlm_lock_desc *new, void *data,
+ int flag)
+{
+ struct lu_env *env;
+ struct cl_env_nest nest;
+ int result;
+
+ /*
+ * This can be called in the context of outer IO, e.g.,
+ *
+ * cl_enqueue()->...
+ * ->osc_enqueue_base()->...
+ * ->ldlm_prep_elc_req()->...
+ * ->ldlm_cancel_callback()->...
+ * ->osc_ldlm_blocking_ast()
+ *
+	 * A new environment has to be created to avoid corrupting the outer context.
+ */
+ env = cl_env_nested_get(&nest);
+ if (!IS_ERR(env)) {
+ result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+ cl_env_nested_put(&nest, env);
+ } else {
+ result = PTR_ERR(env);
+ /*
+		 * XXX This should never happen; when it does, the cl_lock gets
+		 * stuck. A pre-allocated environment, a la vvp_inode_fini_env,
+		 * should be used instead.
+ */
+ LBUG();
+ }
+ if (result != 0) {
+ if (result == -ENODATA)
+ result = 0;
+ else
+ CERROR("BAST failed: %d\n", result);
+ }
+ return result;
+}
+
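+/**
+ * Completion AST for client extent locks: first runs the common ldlm
+ * completion handling (ldlm_completion_ast_async()), then propagates the
+ * result to the cl_lock layer, copying the LVB and, if the lock is already
+ * bound to an osc_lock and granted, finishing the grant via
+ * osc_lock_granted().
+ */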
+static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
+ __u64 flags, void *data)
+{
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct osc_lock *olck;
+ struct cl_lock *lock;
+ int result;
+ int dlmrc;
+
+ /* first, do dlm part of the work */
+ dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
+ /* then, notify cl_lock */
+ env = cl_env_nested_get(&nest);
+ if (!IS_ERR(env)) {
+ olck = osc_ast_data_get(dlmlock);
+ if (olck != NULL) {
+ lock = olck->ols_cl.cls_lock;
+ cl_lock_mutex_get(env, lock);
+ /*
+ * ldlm_handle_cp_callback() copied LVB from request
+ * to lock->l_lvb_data, store it in osc_lock.
+ */
+ LASSERT(dlmlock->l_lvb_data != NULL);
+ lock_res_and_lock(dlmlock);
+ olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+ if (olck->ols_lock == NULL) {
+ /*
+ * upcall (osc_lock_upcall()) hasn't yet been
+ * called. Do nothing now, upcall will bind
+ * olck to dlmlock and signal the waiters.
+ *
+ * This maintains an invariant that osc_lock
+ * and ldlm_lock are always bound when
+ * osc_lock is in OLS_GRANTED state.
+ */
+ } else if (dlmlock->l_granted_mode ==
+ dlmlock->l_req_mode) {
+ osc_lock_granted(env, olck, dlmlock, dlmrc);
+ }
+ unlock_res_and_lock(dlmlock);
+
+ if (dlmrc != 0) {
+ CL_LOCK_DEBUG(D_ERROR, env, lock,
+ "dlmlock returned %d\n", dlmrc);
+ cl_lock_error(env, lock, dlmrc);
+ }
+ cl_lock_mutex_put(env, lock);
+ osc_ast_data_put(env, olck);
+ result = 0;
+ } else
+ result = -ELDLM_NO_LOCK_DATA;
+ cl_env_nested_put(&nest, env);
+ } else
+ result = PTR_ERR(env);
+ return dlmrc ?: result;
+}
+
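+/**
+ * Glimpse AST: the server asks the client holding the lock for up-to-date
+ * object attributes. Packs the reply LVB from cl_object_glimpse() without
+ * taking the cl_lock mutex (see the comment inside for why that is safe).
+ */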
+static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+ struct ptlrpc_request *req = data;
+ struct osc_lock *olck;
+ struct cl_lock *lock;
+ struct cl_object *obj;
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct ost_lvb *lvb;
+ struct req_capsule *cap;
+ int result;
+
+ LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+ env = cl_env_nested_get(&nest);
+ if (!IS_ERR(env)) {
+		/* osc_ast_data_get() has to go after the environment is
+		 * allocated, because it acquires a reference to a lock,
+		 * and that reference can only be released within an
+		 * environment.
+ */
+ olck = osc_ast_data_get(dlmlock);
+ if (olck != NULL) {
+ lock = olck->ols_cl.cls_lock;
+ /* Do not grab the mutex of cl_lock for glimpse.
+ * See LU-1274 for details.
+			 * BTW, it's okay for the cl_lock to be cancelled during
+			 * this period because the server can handle this race.
+ * See ldlm_server_glimpse_ast() for details.
+ * cl_lock_mutex_get(env, lock); */
+ cap = &req->rq_pill;
+ req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
+ req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
+ sizeof(*lvb));
+ result = req_capsule_server_pack(cap);
+ if (result == 0) {
+ lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
+ obj = lock->cll_descr.cld_obj;
+ result = cl_object_glimpse(env, obj, lvb);
+ }
+ if (!exp_connect_lvb_type(req->rq_export))
+ req_capsule_shrink(&req->rq_pill,
+ &RMF_DLM_LVB,
+ sizeof(struct ost_lvb_v1),
+ RCL_SERVER);
+ osc_ast_data_put(env, olck);
+ } else {
+ /*
+ * These errors are normal races, so we don't want to
+ * fill the console with messages by calling
+ * ptlrpc_error()
+ */
+ lustre_pack_reply(req, 1, NULL, NULL);
+ result = -ELDLM_NO_LOCK_DATA;
+ }
+ cl_env_nested_put(&nest, env);
+ } else
+ result = PTR_ERR(env);
+ req->rq_status = result;
+ return result;
+}
+
+static unsigned long osc_lock_weigh(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ /*
+	 * No need to grab coh_page_guard, since we don't care about the
+	 * exact number of pages.
+ */
+ return cl_object_header(slice->cls_obj)->coh_pages;
+}
+
+static void osc_lock_build_einfo(const struct lu_env *env,
+ const struct cl_lock *clock,
+ struct osc_lock *lock,
+ struct ldlm_enqueue_info *einfo)
+{
+ enum cl_lock_mode mode;
+
+ mode = clock->cll_descr.cld_mode;
+ if (mode == CLM_PHANTOM)
+ /*
+ * For now, enqueue all glimpse locks in read mode. In the
+		 * future, the client might choose to enqueue an LCK_PW lock for
+		 * glimpse on a file opened for write.
+ */
+ mode = CLM_READ;
+
+ einfo->ei_type = LDLM_EXTENT;
+ einfo->ei_mode = osc_cl_lock2ldlm(mode);
+ einfo->ei_cb_bl = osc_ldlm_blocking_ast;
+ einfo->ei_cb_cp = osc_ldlm_completion_ast;
+ einfo->ei_cb_gl = osc_ldlm_glimpse_ast;
+ einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */
+}
+
+/**
+ * Determine if the lock should be converted into a lockless lock.
+ *
+ * Steps to check:
+ * - whether the lock has an explicit requirement for a non-lockless lock;
+ * - the IO lock request type (ci_lockreq);
+ * - the enqueue RPC sent to the OST makes the final decision;
+ * - lockless truncate gets special treatment.
+ *
+ * Additional policy can be implemented here, e.g., never do lockless-io
+ * for large extents.
+ */
+static void osc_lock_to_lockless(const struct lu_env *env,
+ struct osc_lock *ols, int force)
+{
+ struct cl_lock_slice *slice = &ols->ols_cl;
+
+ LASSERT(ols->ols_state == OLS_NEW ||
+ ols->ols_state == OLS_UPCALL_RECEIVED);
+
+ if (force) {
+ ols->ols_locklessable = 1;
+ slice->cls_ops = &osc_lock_lockless_ops;
+ } else {
+ struct osc_io *oio = osc_env_io(env);
+ struct cl_io *io = oio->oi_cl.cis_io;
+ struct cl_object *obj = slice->cls_obj;
+ struct osc_object *oob = cl2osc(obj);
+ const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+ struct obd_connect_data *ocd;
+
+ LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+ io->ci_lockreq == CILR_MAYBE ||
+ io->ci_lockreq == CILR_NEVER);
+
+ ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+ ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
+ (io->ci_lockreq == CILR_MAYBE) &&
+ (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
+ if (io->ci_lockreq == CILR_NEVER ||
+ /* lockless IO */
+ (ols->ols_locklessable && osc_object_is_contended(oob)) ||
+ /* lockless truncate */
+ (cl_io_is_trunc(io) &&
+ (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
+ osd->od_lockless_truncate)) {
+ ols->ols_locklessable = 1;
+ slice->cls_ops = &osc_lock_lockless_ops;
+ }
+ }
+ LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+}
+
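+/*
+ * Two osc_locks are compatible when the already-enqueued lock is a glimpse
+ * lock that has passed the upcall (or the new lock is a read lock), or when
+ * both locks are read locks. osc_lock_enqueue_wait() skips cancelling such
+ * compatible locks unless it is enqueuing a lockless lock.
+ */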
+static int osc_lock_compatible(const struct osc_lock *qing,
+ const struct osc_lock *qed)
+{
+ enum cl_lock_mode qing_mode;
+ enum cl_lock_mode qed_mode;
+
+ qing_mode = qing->ols_cl.cls_lock->cll_descr.cld_mode;
+ if (qed->ols_glimpse &&
+ (qed->ols_state >= OLS_UPCALL_RECEIVED || qing_mode == CLM_READ))
+ return 1;
+
+ qed_mode = qed->ols_cl.cls_lock->cll_descr.cld_mode;
+ return ((qing_mode == CLM_READ) && (qed_mode == CLM_READ));
+}
+
+/**
+ * Cancel all conflicting locks and wait for them to be destroyed.
+ *
+ * This function is used for two purposes:
+ *
+ * - early cancel all conflicting locks before starting IO, and
+ *
+ * - guarantee that pages added to the page cache by lockless IO are never
+ * covered by locks other than lockless IO lock, and, hence, are not
+ * visible to other threads.
+ */
+static int osc_lock_enqueue_wait(const struct lu_env *env,
+ const struct osc_lock *olck)
+{
+ struct cl_lock *lock = olck->ols_cl.cls_lock;
+ struct cl_lock_descr *descr = &lock->cll_descr;
+ struct cl_object_header *hdr = cl_object_header(descr->cld_obj);
+ struct cl_lock *scan;
+ struct cl_lock *conflict = NULL;
+ int lockless = osc_lock_is_lockless(olck);
+ int rc = 0;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+
+	/* Let a glimpse lock enqueue anyway, because we don't actually
+	 * need to cancel any conflicting locks for it. */
+ if (olck->ols_glimpse)
+ return 0;
+
+ spin_lock(&hdr->coh_lock_guard);
+ list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+ struct cl_lock_descr *cld = &scan->cll_descr;
+ const struct osc_lock *scan_ols;
+
+ if (scan == lock)
+ break;
+
+ if (scan->cll_state < CLS_QUEUING ||
+ scan->cll_state == CLS_FREEING ||
+ cld->cld_start > descr->cld_end ||
+ cld->cld_end < descr->cld_start)
+ continue;
+
+ /* overlapped and living locks. */
+
+		/* We're not supposed to give up a group lock. */
+ if (scan->cll_descr.cld_mode == CLM_GROUP) {
+ LASSERT(descr->cld_mode != CLM_GROUP ||
+ descr->cld_gid != scan->cll_descr.cld_gid);
+ continue;
+ }
+
+ scan_ols = osc_lock_at(scan);
+
+		/* We need to cancel even compatible locks when we're enqueuing
+		 * a lockless lock. For example: imagine that a client has a PR
+		 * lock on [0, 1000], and thread T0 is doing lockless IO in the
+		 * [500, 1500] region. A concurrent thread T1 could then see
+		 * lockless data in [500, 1000], which is wrong, because that
+		 * data may be stale. */
+ if (!lockless && osc_lock_compatible(olck, scan_ols))
+ continue;
+
+ cl_lock_get_trust(scan);
+ conflict = scan;
+ break;
+ }
+ spin_unlock(&hdr->coh_lock_guard);
+
+ if (conflict) {
+ if (lock->cll_descr.cld_mode == CLM_GROUP) {
+			/* We want a group lock, but a previous lock request
+			 * conflicts; do not wait but return 0 so the
+			 * request is sent to the server.
+			 */
+ CDEBUG(D_DLMTRACE, "group lock %p is conflicted with %p, no wait, send to server\n",
+ lock, conflict);
+ cl_lock_put(env, conflict);
+ rc = 0;
+ } else {
+ CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, will wait\n",
+ lock, conflict);
+ LASSERT(lock->cll_conflict == NULL);
+ lu_ref_add(&conflict->cll_reference, "cancel-wait",
+ lock);
+ lock->cll_conflict = conflict;
+ rc = CLO_WAIT;
+ }
+ }
+ return rc;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() method for osc
+ * layer. This initiates ldlm enqueue:
+ *
+ * - cancels conflicting locks early (osc_lock_enqueue_wait());
+ *
+ * - calls osc_enqueue_base() to do actual enqueue.
+ *
+ * osc_enqueue_base() is supplied with an upcall function that is executed
+ * when lock is received either after a local cached ldlm lock is matched, or
+ * when a reply from the server is received.
+ *
+ * This function does not wait for the network communication to complete.
+ */
+static int osc_lock_enqueue(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ struct cl_io *unused, __u32 enqflags)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+ struct cl_lock *lock = ols->ols_cl.cls_lock;
+ int result;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERTF(ols->ols_state == OLS_NEW,
+ "Impossible state: %d\n", ols->ols_state);
+
+ LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ),
+ "lock = %p, ols = %p\n", lock, ols);
+
+ result = osc_lock_enqueue_wait(env, ols);
+ if (result == 0) {
+ if (!osc_lock_is_lockless(ols)) {
+ struct osc_object *obj = cl2osc(slice->cls_obj);
+ struct osc_thread_info *info = osc_env_info(env);
+ struct ldlm_res_id *resname = &info->oti_resname;
+ ldlm_policy_data_t *policy = &info->oti_policy;
+ struct ldlm_enqueue_info *einfo = &ols->ols_einfo;
+
+			/* The lock will be passed as the upcall cookie;
+			 * hold a reference to prevent it from being released. */
+ cl_lock_hold_add(env, lock, "upcall", lock);
+			/* also add a user for the lock */
+ cl_lock_user_add(env, lock);
+ ols->ols_state = OLS_ENQUEUED;
+
+ /*
+			 * XXX: this is a possible blocking point, as
+ * ldlm_lock_match(LDLM_FL_LVB_READY) waits for
+ * LDLM_CP_CALLBACK.
+ */
+ ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname);
+ osc_lock_build_policy(env, lock, policy);
+ result = osc_enqueue_base(osc_export(obj), resname,
+ &ols->ols_flags, policy,
+ &ols->ols_lvb,
+ obj->oo_oinfo->loi_kms_valid,
+ osc_lock_upcall,
+ ols, einfo, &ols->ols_handle,
+ PTLRPCD_SET, 1, ols->ols_agl);
+ if (result != 0) {
+ cl_lock_user_del(env, lock);
+ cl_lock_unhold(env, lock, "upcall", lock);
+ if (unlikely(result == -ECANCELED)) {
+ ols->ols_state = OLS_NEW;
+ result = 0;
+ }
+ }
+ } else {
+ ols->ols_state = OLS_GRANTED;
+ ols->ols_owner = osc_env_io(env);
+ }
+ }
+ LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+ return result;
+}
+
+static int osc_lock_wait(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *olck = cl2osc_lock(slice);
+ struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+ LINVRNT(osc_lock_invariant(olck));
+
+ if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) {
+ if (olck->ols_flags & LDLM_FL_LVB_READY) {
+ return 0;
+ } else if (olck->ols_agl) {
+ if (lock->cll_flags & CLF_FROM_UPCALL)
+				/* This call comes from the enqueue RPC reply upcall
+				 * updating the state; do not re-enqueue. */
+ return -ENAVAIL;
+ else
+ olck->ols_state = OLS_NEW;
+ } else {
+ LASSERT(lock->cll_error);
+ return lock->cll_error;
+ }
+ }
+
+ if (olck->ols_state == OLS_NEW) {
+ int rc;
+
+ LASSERT(olck->ols_agl);
+ olck->ols_agl = 0;
+ olck->ols_flags &= ~LDLM_FL_BLOCK_NOWAIT;
+ rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST);
+ if (rc != 0)
+ return rc;
+ else
+ return CLO_REENQUEUED;
+ }
+
+ LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED &&
+ lock->cll_error == 0, olck->ols_lock != NULL));
+
+ return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_use() method that pins cached
+ * lock.
+ */
+static int osc_lock_use(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *olck = cl2osc_lock(slice);
+ int rc;
+
+ LASSERT(!olck->ols_hold);
+
+ /*
+ * Atomically check for LDLM_FL_CBPENDING and addref a lock if this
+ * flag is not set. This protects us from a concurrent blocking ast.
+ */
+ rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode);
+ if (rc == 0) {
+ olck->ols_hold = 1;
+ olck->ols_state = OLS_GRANTED;
+ } else {
+ struct cl_lock *lock;
+
+ /*
+ * Lock is being cancelled somewhere within
+ * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already
+ * set, but osc_ldlm_blocking_ast() hasn't yet acquired
+ * cl_lock mutex.
+ */
+ lock = slice->cls_lock;
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+ LASSERT(lock->cll_users > 0);
+ /* set a flag for osc_dlm_blocking_ast0() to signal the
+ * lock.*/
+ olck->ols_ast_wait = 1;
+ rc = CLO_WAIT;
+ }
+ return rc;
+}
+
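+/**
+ * Flushes the lock extent before cancellation: writes back (or discards)
+ * covered dirty pages for CLM_WRITE and stronger modes, then discards any
+ * remaining pages attached to the cl_lock.
+ */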
+static int osc_lock_flush(struct osc_lock *ols, int discard)
+{
+ struct cl_lock *lock = ols->ols_cl.cls_lock;
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ int result = 0;
+
+ env = cl_env_nested_get(&nest);
+ if (!IS_ERR(env)) {
+ struct osc_object *obj = cl2osc(ols->ols_cl.cls_obj);
+ struct cl_lock_descr *descr = &lock->cll_descr;
+ int rc = 0;
+
+ if (descr->cld_mode >= CLM_WRITE) {
+ result = osc_cache_writeback_range(env, obj,
+ descr->cld_start, descr->cld_end,
+ 1, discard);
+ LDLM_DEBUG(ols->ols_lock,
+ "lock %p: %d pages were %s.\n", lock, result,
+ discard ? "discarded" : "written");
+ if (result > 0)
+ result = 0;
+ }
+
+ rc = cl_lock_discard_pages(env, lock);
+ if (result == 0 && rc < 0)
+ result = rc;
+
+ cl_env_nested_put(&nest, env);
+ } else
+ result = PTR_ERR(env);
+ if (result == 0) {
+ ols->ols_flush = 1;
+ LINVRNT(!osc_lock_has_pages(ols));
+ }
+ return result;
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when a lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to a
+ * conflict with some other lock somewhere in the cluster. This function does the
+ * following:
+ *
+ * - invalidates all pages protected by this lock (after sending dirty
+ * ones to the server, as necessary);
+ *
+ * - decref's underlying ldlm lock;
+ *
+ * - cancels ldlm lock (ldlm_cli_cancel()).
+ */
+static void osc_lock_cancel(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct cl_lock *lock = slice->cls_lock;
+ struct osc_lock *olck = cl2osc_lock(slice);
+ struct ldlm_lock *dlmlock = olck->ols_lock;
+ int result = 0;
+ int discard;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LINVRNT(osc_lock_invariant(olck));
+
+ if (dlmlock != NULL) {
+ int do_cancel;
+
+ discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA);
+ if (olck->ols_state >= OLS_GRANTED)
+ result = osc_lock_flush(olck, discard);
+ osc_lock_unhold(olck);
+
+ lock_res_and_lock(dlmlock);
+		/* Now that we're the only user of the DLM read/write reference,
+		 * ->l_readers + ->l_writers should normally be zero.
+		 * However, there is a corner case.
+		 * See bug 18829 for details. */
+ do_cancel = (dlmlock->l_readers == 0 &&
+ dlmlock->l_writers == 0);
+ dlmlock->l_flags |= LDLM_FL_CBPENDING;
+ unlock_res_and_lock(dlmlock);
+ if (do_cancel)
+ result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC);
+ if (result < 0)
+ CL_LOCK_DEBUG(D_ERROR, env, lock,
+ "lock %p cancel failure with error(%d)\n",
+ lock, result);
+ }
+ olck->ols_state = OLS_CANCELLED;
+ olck->ols_flags &= ~LDLM_FL_LVB_READY;
+ osc_lock_detach(env, olck);
+}
+
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+ return 0;
+}
+
+static void osc_lock_delete(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *olck;
+
+ olck = cl2osc_lock(slice);
+ if (olck->ols_glimpse) {
+ LASSERT(!olck->ols_hold);
+ LASSERT(!olck->ols_lock);
+ return;
+ }
+
+ LINVRNT(osc_lock_invariant(olck));
+ LINVRNT(!osc_lock_has_pages(olck));
+
+ osc_lock_unhold(olck);
+ osc_lock_detach(env, olck);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for osc layer.
+ *
+ * Maintains osc_lock::ols_owner field.
+ *
+ * This assumes that a lock always enters CLS_HELD (from some other state) in
+ * the same IO context as the one that requested the lock. This should not be a
+ * problem, because the context is by definition shared by all activity pertaining
+ * to the same high-level IO.
+ */
+static void osc_lock_state(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state state)
+{
+ struct osc_lock *lock = cl2osc_lock(slice);
+
+ /*
+ * XXX multiple io contexts can use the lock at the same time.
+ */
+ LINVRNT(osc_lock_invariant(lock));
+ if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) {
+ struct osc_io *oio = osc_env_io(env);
+
+ LASSERT(lock->ols_owner == NULL);
+ lock->ols_owner = oio;
+ } else if (state != CLS_HELD)
+ lock->ols_owner = NULL;
+}
+
+static int osc_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_lock_slice *slice)
+{
+ struct osc_lock *lock = cl2osc_lock(slice);
+
+ /*
+ * XXX print ldlm lock and einfo properly.
+ */
+ (*p)(env, cookie, "%p %#16llx %#llx %d %p ",
+ lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie,
+ lock->ols_state, lock->ols_owner);
+ osc_lvb_print(env, cookie, p, &lock->ols_lvb);
+ return 0;
+}
+
+static int osc_lock_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+
+ if (need->cld_enq_flags & CEF_NEVER)
+ return 0;
+
+ if (ols->ols_state >= OLS_CANCELLED)
+ return 0;
+
+ if (need->cld_mode == CLM_PHANTOM) {
+ if (ols->ols_agl)
+ return !(ols->ols_state > OLS_RELEASED);
+
+ /*
+		 * Note: a QUEUED lock can't be matched here, otherwise
+		 * it might cause deadlocks. For example, in a read process:
+ * P1: enqueued read lock, create sublock1
+ * P2: enqueued write lock, create sublock2(conflicted
+ * with sublock1).
+ * P1: Grant read lock.
+ * P1: enqueued glimpse lock(with holding sublock1_read),
+ * matched with sublock2, waiting sublock2 to be granted.
+ * But sublock2 can not be granted, because P1
+ * will not release sublock1. Bang!
+ */
+ if (ols->ols_state < OLS_GRANTED ||
+ ols->ols_state > OLS_RELEASED)
+ return 0;
+ } else if (need->cld_enq_flags & CEF_MUST) {
+ /*
+		 * If the lock has never been enqueued, it can't be matched,
+		 * because the enqueue process brings in information that is
+		 * used to determine attributes such as locklessness,
+		 * CEF_MUST, etc.
+ */
+ if (ols->ols_state < OLS_UPCALL_RECEIVED &&
+ ols->ols_locklessable)
+ return 0;
+ }
+ return 1;
+}
+
+static const struct cl_lock_operations osc_lock_ops = {
+ .clo_fini = osc_lock_fini,
+ .clo_enqueue = osc_lock_enqueue,
+ .clo_wait = osc_lock_wait,
+ .clo_unuse = osc_lock_unuse,
+ .clo_use = osc_lock_use,
+ .clo_delete = osc_lock_delete,
+ .clo_state = osc_lock_state,
+ .clo_cancel = osc_lock_cancel,
+ .clo_weigh = osc_lock_weigh,
+ .clo_print = osc_lock_print,
+ .clo_fits_into = osc_lock_fits_into,
+};
+
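+/*
+ * For lockless locks there is no DLM lock to release: "unuse" simply
+ * cancels and deletes the cl_lock, so the next IO enqueues afresh.
+ */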
+static int osc_lock_lockless_unuse(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+ struct cl_lock *lock = slice->cls_lock;
+
+ LASSERT(ols->ols_state == OLS_GRANTED);
+ LINVRNT(osc_lock_invariant(ols));
+
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ return 0;
+}
+
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *ols = cl2osc_lock(slice);
+ int result;
+
+ result = osc_lock_flush(ols, 0);
+ if (result)
+ CERROR("Pages for lockless lock %p were not purged(%d)\n",
+ ols, result);
+ ols->ols_state = OLS_CANCELLED;
+}
+
+static int osc_lock_lockless_wait(const struct lu_env *env,
+ const struct cl_lock_slice *slice)
+{
+ struct osc_lock *olck = cl2osc_lock(slice);
+ struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+ LINVRNT(osc_lock_invariant(olck));
+ LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED);
+
+ return lock->cll_error;
+}
+
+static void osc_lock_lockless_state(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ enum cl_lock_state state)
+{
+ struct osc_lock *lock = cl2osc_lock(slice);
+
+ LINVRNT(osc_lock_invariant(lock));
+ if (state == CLS_HELD) {
+ struct osc_io *oio = osc_env_io(env);
+
+ LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio));
+ lock->ols_owner = oio;
+
+ /* set the io to be lockless if this lock is for io's
+ * host object */
+ if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj))
+ oio->oi_lockless = 1;
+ }
+}
+
+static int osc_lock_lockless_fits_into(const struct lu_env *env,
+ const struct cl_lock_slice *slice,
+ const struct cl_lock_descr *need,
+ const struct cl_io *io)
+{
+ struct osc_lock *lock = cl2osc_lock(slice);
+
+ if (!(need->cld_enq_flags & CEF_NEVER))
+ return 0;
+
+	/* A lockless lock should only be used by its owning io. b22147 */
+ return (lock->ols_owner == osc_env_io(env));
+}
+
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+ .clo_fini = osc_lock_fini,
+ .clo_enqueue = osc_lock_enqueue,
+ .clo_wait = osc_lock_lockless_wait,
+ .clo_unuse = osc_lock_lockless_unuse,
+ .clo_state = osc_lock_lockless_state,
+ .clo_fits_into = osc_lock_lockless_fits_into,
+ .clo_cancel = osc_lock_lockless_cancel,
+ .clo_print = osc_lock_print
+};
+
+int osc_lock_init(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *unused)
+{
+ struct osc_lock *clk;
+ int result;
+
+ OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, GFP_NOFS);
+ if (clk != NULL) {
+ __u32 enqflags = lock->cll_descr.cld_enq_flags;
+
+ osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
+ atomic_set(&clk->ols_pageref, 0);
+ clk->ols_state = OLS_NEW;
+
+ clk->ols_flags = osc_enq2ldlm_flags(enqflags);
+ clk->ols_agl = !!(enqflags & CEF_AGL);
+ if (clk->ols_agl)
+ clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT;
+ if (clk->ols_flags & LDLM_FL_HAS_INTENT)
+ clk->ols_glimpse = 1;
+
+ cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops);
+
+ if (!(enqflags & CEF_MUST))
+ /* try to convert this lock to a lockless lock */
+ osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER));
+ if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA))
+ clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
+
+ LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n",
+ lock, clk, clk->ols_flags);
+
+ result = 0;
+ } else
+ result = -ENOMEM;
+ return result;
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm)
+{
+ struct osc_lock *olock;
+ int rc = 0;
+
+ spin_lock(&osc_ast_guard);
+ olock = dlm->l_ast_data;
+ /*
+	 * there's a very rare race with osc_page_addref_lock(), but that
+	 * doesn't matter, because in the worst case we simply fail to cancel
+	 * a lock that we actually could cancel, which is harmless.
+ */
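+	/*
+	 * Adding _PAGEREF_MAGIC atomically tells page references and
+	 * "no references" apart: if any page holds a reference, undo the
+	 * addition and report 1; otherwise leave the sentinel in place so
+	 * that osc_page_addref_lock()'s ols_pageref check rejects new
+	 * page references.
+	 */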
+ if (olock != NULL &&
+ atomic_add_return(_PAGEREF_MAGIC,
+ &olock->ols_pageref) != _PAGEREF_MAGIC) {
+ atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref);
+ rc = 1;
+ }
+ spin_unlock(&osc_ast_guard);
+ return rc;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c
new file mode 100644
index 000000000..92c202f70
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_object.c
@@ -0,0 +1,271 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for OSC layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_object *osc2lu(struct osc_object *osc)
+{
+ return &osc->oo_cl.co_lu;
+}
+
+static struct osc_object *lu2osc(const struct lu_object *obj)
+{
+ LINVRNT(osc_is_object(obj));
+ return container_of0(obj, struct osc_object, oo_cl.co_lu);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf)
+{
+ struct osc_object *osc = lu2osc(obj);
+ const struct cl_object_conf *cconf = lu2cl_conf(conf);
+ int i;
+
+ osc->oo_oinfo = cconf->u.coc_oinfo;
+ spin_lock_init(&osc->oo_seatbelt);
+ for (i = 0; i < CRT_NR; ++i)
+ INIT_LIST_HEAD(&osc->oo_inflight[i]);
+
+ INIT_LIST_HEAD(&osc->oo_ready_item);
+ INIT_LIST_HEAD(&osc->oo_hp_ready_item);
+ INIT_LIST_HEAD(&osc->oo_write_item);
+ INIT_LIST_HEAD(&osc->oo_read_item);
+
+ osc->oo_root.rb_node = NULL;
+ INIT_LIST_HEAD(&osc->oo_hp_exts);
+ INIT_LIST_HEAD(&osc->oo_urgent_exts);
+ INIT_LIST_HEAD(&osc->oo_rpc_exts);
+ INIT_LIST_HEAD(&osc->oo_reading_exts);
+ atomic_set(&osc->oo_nr_reads, 0);
+ atomic_set(&osc->oo_nr_writes, 0);
+ spin_lock_init(&osc->oo_lock);
+
+ cl_object_page_init(lu2cl(obj), sizeof(struct osc_page));
+
+ return 0;
+}
+
+static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+ struct osc_object *osc = lu2osc(obj);
+ int i;
+
+ for (i = 0; i < CRT_NR; ++i)
+ LASSERT(list_empty(&osc->oo_inflight[i]));
+
+ LASSERT(list_empty(&osc->oo_ready_item));
+ LASSERT(list_empty(&osc->oo_hp_ready_item));
+ LASSERT(list_empty(&osc->oo_write_item));
+ LASSERT(list_empty(&osc->oo_read_item));
+
+ LASSERT(osc->oo_root.rb_node == NULL);
+ LASSERT(list_empty(&osc->oo_hp_exts));
+ LASSERT(list_empty(&osc->oo_urgent_exts));
+ LASSERT(list_empty(&osc->oo_rpc_exts));
+ LASSERT(list_empty(&osc->oo_reading_exts));
+ LASSERT(atomic_read(&osc->oo_nr_reads) == 0);
+ LASSERT(atomic_read(&osc->oo_nr_writes) == 0);
+
+ lu_object_fini(obj);
+ OBD_SLAB_FREE_PTR(osc, osc_object_kmem);
+}
+
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct ost_lvb *lvb)
+{
+ return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu",
+ lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+ lvb->lvb_ctime, lvb->lvb_blocks);
+}
+
+static int osc_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *obj)
+{
+ struct osc_object *osc = lu2osc(obj);
+ struct lov_oinfo *oinfo = osc->oo_oinfo;
+ struct osc_async_rc *ar = &oinfo->loi_ar;
+
+ (*p)(env, cookie, "id: " DOSTID " idx: %d gen: %d kms_valid: %u kms %llu rc: %d force_sync: %d min_xid: %llu ",
+ POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx,
+ oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms,
+ ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid);
+ osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
+ return 0;
+}
+
+
+static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+ cl_lvb2attr(attr, &oinfo->loi_lvb);
+ attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0;
+ return 0;
+}
+
+int osc_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+ struct ost_lvb *lvb = &oinfo->loi_lvb;
+
+ if (valid & CAT_SIZE)
+ lvb->lvb_size = attr->cat_size;
+ if (valid & CAT_MTIME)
+ lvb->lvb_mtime = attr->cat_mtime;
+ if (valid & CAT_ATIME)
+ lvb->lvb_atime = attr->cat_atime;
+ if (valid & CAT_CTIME)
+ lvb->lvb_ctime = attr->cat_ctime;
+ if (valid & CAT_BLOCKS)
+ lvb->lvb_blocks = attr->cat_blocks;
+ if (valid & CAT_KMS) {
+ CDEBUG(D_CACHE, "set kms from %llu to %llu\n",
+ oinfo->loi_kms, (__u64)attr->cat_kms);
+ loi_kms_set(oinfo, attr->cat_kms);
+ }
+ return 0;
+}
+
+static int osc_object_glimpse(const struct lu_env *env,
+ const struct cl_object *obj, struct ost_lvb *lvb)
+{
+ struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+ lvb->lvb_size = oinfo->loi_kms;
+ lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+ return 0;
+}
+
+
+void osc_object_set_contended(struct osc_object *obj)
+{
+ obj->oo_contention_time = cfs_time_current();
+ /* mb(); */
+ obj->oo_contended = 1;
+}
+
+void osc_object_clear_contended(struct osc_object *obj)
+{
+ obj->oo_contended = 0;
+}
+
+int osc_object_is_contended(struct osc_object *obj)
+{
+ struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
+ int osc_contention_time = dev->od_contention_time;
+ unsigned long cur_time = cfs_time_current();
+ unsigned long retry_time;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
+ return 1;
+
+ if (!obj->oo_contended)
+ return 0;
+
+ /*
+	 * The logic is copied from ll_file_is_contended(): the object stays
+	 * contended until od_contention_time seconds have elapsed since the
+	 * last recorded contention.
+ */
+ retry_time = cfs_time_add(obj->oo_contention_time,
+ cfs_time_seconds(osc_contention_time));
+ if (cfs_time_after(cur_time, retry_time)) {
+ osc_object_clear_contended(obj);
+ return 0;
+ }
+ return 1;
+}
+
+static const struct cl_object_operations osc_ops = {
+ .coo_page_init = osc_page_init,
+ .coo_lock_init = osc_lock_init,
+ .coo_io_init = osc_io_init,
+ .coo_attr_get = osc_attr_get,
+ .coo_attr_set = osc_attr_set,
+ .coo_glimpse = osc_object_glimpse
+};
+
+static const struct lu_object_operations osc_lu_obj_ops = {
+ .loo_object_init = osc_object_init,
+ .loo_object_delete = NULL,
+ .loo_object_release = NULL,
+ .loo_object_free = osc_object_free,
+ .loo_object_print = osc_object_print,
+ .loo_object_invariant = NULL
+};
+
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+ const struct lu_object_header *unused,
+ struct lu_device *dev)
+{
+ struct osc_object *osc;
+ struct lu_object *obj;
+
+ OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, GFP_NOFS);
+ if (osc != NULL) {
+ obj = osc2lu(osc);
+ lu_object_init(obj, NULL, dev);
+ osc->oo_cl.co_ops = &osc_ops;
+ obj->lo_ops = &osc_lu_obj_ops;
+ } else
+ obj = NULL;
+ return obj;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
new file mode 100644
index 000000000..76ba58b09
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c
@@ -0,0 +1,916 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for OSC layer.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del);
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg);
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+ struct osc_page *opg);
+
+/** \addtogroup osc
+ * @{
+ */
+
+/*
+ * osc_page_protected() is commented out because it may sleep inside
+ * the client_obd_list_lock:
+ * client_obd_list_lock -> osc_ap_completion -> osc_completion ->
+ * -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base
+ * -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep.
+ */
+#if 0
+static int osc_page_is_dlocked(const struct lu_env *env,
+ const struct osc_page *opg,
+ enum cl_lock_mode mode, int pending, int unref)
+{
+ struct cl_page *page;
+ struct osc_object *obj;
+ struct osc_thread_info *info;
+ struct ldlm_res_id *resname;
+ struct lustre_handle *lockh;
+ ldlm_policy_data_t *policy;
+ ldlm_mode_t dlmmode;
+ __u64 flags;
+
+ might_sleep();
+
+ info = osc_env_info(env);
+ resname = &info->oti_resname;
+ policy = &info->oti_policy;
+ lockh = &info->oti_handle;
+ page = opg->ops_cl.cpl_page;
+ obj = cl2osc(opg->ops_cl.cpl_obj);
+
+ flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
+ if (pending)
+ flags |= LDLM_FL_CBPENDING;
+
+ dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW;
+ osc_lock_build_res(env, obj, resname);
+ osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index);
+ return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
+ dlmmode, &flags, NULL, lockh, unref);
+}
+
+/**
+ * Checks an invariant that a page in the cache is covered by a lock, as
+ * needed.
+ */
+static int osc_page_protected(const struct lu_env *env,
+ const struct osc_page *opg,
+ enum cl_lock_mode mode, int unref)
+{
+ struct cl_object_header *hdr;
+ struct cl_lock *scan;
+ struct cl_page *page;
+ struct cl_lock_descr *descr;
+ int result;
+
+ LINVRNT(!opg->ops_temp);
+
+ page = opg->ops_cl.cpl_page;
+ if (page->cp_owner != NULL &&
+ cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER)
+ /*
+ * If IO is done without locks (liblustre, or lloop), lock is
+ * not required.
+ */
+ result = 1;
+ else
+ /* otherwise check for a DLM lock */
+ result = osc_page_is_dlocked(env, opg, mode, 1, unref);
+ if (result == 0) {
+ /* maybe this page is a part of a lockless io? */
+ hdr = cl_object_header(opg->ops_cl.cpl_obj);
+ descr = &osc_env_info(env)->oti_descr;
+ descr->cld_mode = mode;
+ descr->cld_start = page->cp_index;
+ descr->cld_end = page->cp_index;
+ spin_lock(&hdr->coh_lock_guard);
+ list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+ /*
+ * Lock-less sub-lock has to be either in HELD state
+ * (when io is actively going on), or in CACHED state,
+ * when top-lock is being unlocked:
+ * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse().
+ */
+ if ((scan->cll_state == CLS_HELD ||
+ scan->cll_state == CLS_CACHED) &&
+ cl_lock_ext_match(&scan->cll_descr, descr)) {
+ struct osc_lock *olck;
+
+ olck = osc_lock_at(scan);
+ result = osc_lock_is_lockless(olck);
+ break;
+ }
+ }
+ spin_unlock(&hdr->coh_lock_guard);
+ }
+ return result;
+}
+#else
+static int osc_page_protected(const struct lu_env *env,
+ const struct osc_page *opg,
+ enum cl_lock_mode mode, int unref)
+{
+ return 1;
+}
+#endif
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+static void osc_page_fini(const struct lu_env *env,
+ struct cl_page_slice *slice)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ CDEBUG(D_TRACE, "%p\n", opg);
+ LASSERT(opg->ops_lock == NULL);
+}
+
+static void osc_page_transfer_get(struct osc_page *opg, const char *label)
+{
+ struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+ LASSERT(!opg->ops_transfer_pinned);
+ cl_page_get(page);
+ lu_ref_add_atomic(&page->cp_reference, label, page);
+ opg->ops_transfer_pinned = 1;
+}
+
+static void osc_page_transfer_put(const struct lu_env *env,
+ struct osc_page *opg)
+{
+ struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+ if (opg->ops_transfer_pinned) {
+ lu_ref_del(&page->cp_reference, "transfer", page);
+ opg->ops_transfer_pinned = 0;
+ cl_page_put(env, page);
+ }
+}
+
+/**
+ * This is called once for every page when it is submitted for a transfer
+ * either opportunistic (osc_page_cache_add()), or immediate
+ * (osc_page_submit()).
+ */
+static void osc_page_transfer_add(const struct lu_env *env,
+ struct osc_page *opg, enum cl_req_type crt)
+{
+ struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+ /* ops_lru and ops_inflight share the same field, so take it from LRU
+ * first and then use it as inflight. */
+ osc_lru_del(osc_cli(obj), opg, false);
+
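+	/* oo_seatbelt protects the per-object lists of pages in flight,
+	 * indexed by request type (read/write). */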
+ spin_lock(&obj->oo_seatbelt);
+ list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
+ opg->ops_submitter = current;
+ spin_unlock(&obj->oo_seatbelt);
+}
+
+static int osc_page_cache_add(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io)
+{
+ struct osc_io *oio = osc_env_io(env);
+ struct osc_page *opg = cl2osc_page(slice);
+ int result;
+
+ LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0));
+
+ osc_page_transfer_get(opg, "transfer\0cache");
+ result = osc_queue_async_io(env, io, opg);
+ if (result != 0)
+ osc_page_transfer_put(env, opg);
+ else
+ osc_page_transfer_add(env, opg, CRT_WRITE);
+
+	/* For a sync write, the kernel will wait for this page to be flushed
+	 * before osc_io_end() is called, so release it earlier.
+	 * For mkwrite(), it is known that there are no further pages. */
+ if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) {
+ if (oio->oi_active != NULL) {
+ osc_extent_release(env, oio->oi_active);
+ oio->oi_active = NULL;
+ }
+ }
+
+ return result;
+}
+
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+ pgoff_t start, pgoff_t end)
+{
+ memset(policy, 0, sizeof(*policy));
+ policy->l_extent.start = cl_offset(obj, start);
+ policy->l_extent.end = cl_offset(obj, end + 1) - 1;
+}
+
+static int osc_page_addref_lock(const struct lu_env *env,
+ struct osc_page *opg,
+ struct cl_lock *lock)
+{
+ struct osc_lock *olock;
+ int rc;
+
+ LASSERT(opg->ops_lock == NULL);
+
+ olock = osc_lock_at(lock);
+ if (atomic_inc_return(&olock->ols_pageref) <= 0) {
+ atomic_dec(&olock->ols_pageref);
+ rc = -ENODATA;
+ } else {
+ cl_lock_get(lock);
+ opg->ops_lock = lock;
+ rc = 0;
+ }
+ return rc;
+}
+
+static void osc_page_putref_lock(const struct lu_env *env,
+ struct osc_page *opg)
+{
+ struct cl_lock *lock = opg->ops_lock;
+ struct osc_lock *olock;
+
+ LASSERT(lock != NULL);
+ olock = osc_lock_at(lock);
+
+ atomic_dec(&olock->ols_pageref);
+ opg->ops_lock = NULL;
+
+ cl_lock_put(env, lock);
+}
+
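+/*
+ * Reports whether the page is covered by a lock: returns -EBUSY (and pins
+ * the covering lock via osc_page_addref_lock()) when a lock is found,
+ * -ENODATA otherwise.
+ */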
+static int osc_page_is_under_lock(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ struct cl_lock *lock;
+ int result = -ENODATA;
+
+ lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
+ NULL, 1, 0);
+ if (lock != NULL) {
+ if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0)
+ result = -EBUSY;
+ cl_lock_put(env, lock);
+ }
+ return result;
+}
+
+static void osc_page_disown(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+
+ if (unlikely(opg->ops_lock))
+ osc_page_putref_lock(env, opg);
+}
+
+static void osc_page_completion_read(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+ if (likely(opg->ops_lock))
+ osc_page_putref_lock(env, opg);
+ osc_lru_add(osc_cli(obj), opg);
+}
+
+static void osc_page_completion_write(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ int ioret)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ struct osc_object *obj = cl2osc(slice->cpl_obj);
+
+ osc_lru_add(osc_cli(obj), opg);
+}
+
+static int osc_page_fail(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *unused)
+{
+ /*
+ * Cached read?
+ */
+ LBUG();
+ return 0;
+}
+
+
+static const char *osc_list(struct list_head *head)
+{
+ return list_empty(head) ? "-" : "+";
+}
+
+static inline unsigned long osc_submit_duration(struct osc_page *opg)
+{
+ if (opg->ops_submit_time == 0)
+ return 0;
+
+ return (cfs_time_current() - opg->ops_submit_time);
+}
+
+static int osc_page_print(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t printer)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ struct osc_async_page *oap = &opg->ops_oap;
+ struct osc_object *obj = cl2osc(slice->cpl_obj);
+ struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli;
+
+ return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %s %p %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n",
+ opg,
+ /* 1 */
+ oap->oap_magic, oap->oap_cmd,
+ oap->oap_interrupted,
+ osc_list(&oap->oap_pending_item),
+ osc_list(&oap->oap_rpc_item),
+ /* 2 */
+ oap->oap_obj_off, oap->oap_page_off, oap->oap_count,
+ oap->oap_async_flags, oap->oap_brw_flags,
+ oap->oap_request, oap->oap_cli, obj,
+ /* 3 */
+ osc_list(&opg->ops_inflight),
+ opg->ops_submitter, opg->ops_transfer_pinned,
+ osc_submit_duration(opg), opg->ops_srvlock,
+ /* 4 */
+ cli->cl_r_in_flight, cli->cl_w_in_flight,
+ cli->cl_max_rpcs_in_flight,
+ cli->cl_avail_grant,
+ osc_list(&cli->cl_cache_waiters),
+ osc_list(&cli->cl_loi_ready_list),
+ osc_list(&cli->cl_loi_hp_ready_list),
+ osc_list(&cli->cl_loi_write_list),
+ osc_list(&cli->cl_loi_read_list),
+ /* 5 */
+ osc_list(&obj->oo_ready_item),
+ osc_list(&obj->oo_hp_ready_item),
+ osc_list(&obj->oo_write_item),
+ osc_list(&obj->oo_read_item),
+ atomic_read(&obj->oo_nr_reads),
+ osc_list(&obj->oo_reading_exts),
+ atomic_read(&obj->oo_nr_writes),
+ osc_list(&obj->oo_hp_exts),
+ osc_list(&obj->oo_urgent_exts));
+}
+
+static void osc_page_delete(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+ int rc;
+
+ LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1));
+
+ CDEBUG(D_TRACE, "%p\n", opg);
+ osc_page_transfer_put(env, opg);
+ rc = osc_teardown_async_page(env, obj, opg);
+ if (rc) {
+ CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page),
+ "Trying to teardown failed: %d\n", rc);
+ LASSERT(0);
+ }
+
+ spin_lock(&obj->oo_seatbelt);
+ if (opg->ops_submitter != NULL) {
+ LASSERT(!list_empty(&opg->ops_inflight));
+ list_del_init(&opg->ops_inflight);
+ opg->ops_submitter = NULL;
+ }
+ spin_unlock(&obj->oo_seatbelt);
+
+ osc_lru_del(osc_cli(obj), opg, true);
+}
+
+void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice,
+ int from, int to)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ struct osc_async_page *oap = &opg->ops_oap;
+
+ LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+ opg->ops_from = from;
+ opg->ops_to = to;
+ spin_lock(&oap->oap_lock);
+ oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+ spin_unlock(&oap->oap_lock);
+}
+
+static int osc_page_cancel(const struct lu_env *env,
+ const struct cl_page_slice *slice)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ int rc = 0;
+
+ LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+	/* Check whether the transfer of this page has completed,
+	 * or was never even queued. */
+ if (opg->ops_transfer_pinned)
+ /* FIXME: may not be interrupted.. */
+ rc = osc_cancel_async_page(env, opg);
+ LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
+ return rc;
+}
+
+static int osc_page_flush(const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ struct cl_io *io)
+{
+ struct osc_page *opg = cl2osc_page(slice);
+ int rc = 0;
+
+ rc = osc_flush_async_page(env, io, opg);
+ return rc;
+}
+
+static const struct cl_page_operations osc_page_ops = {
+ .cpo_fini = osc_page_fini,
+ .cpo_print = osc_page_print,
+ .cpo_delete = osc_page_delete,
+ .cpo_is_under_lock = osc_page_is_under_lock,
+ .cpo_disown = osc_page_disown,
+ .io = {
+ [CRT_READ] = {
+ .cpo_cache_add = osc_page_fail,
+ .cpo_completion = osc_page_completion_read
+ },
+ [CRT_WRITE] = {
+ .cpo_cache_add = osc_page_cache_add,
+ .cpo_completion = osc_page_completion_write
+ }
+ },
+ .cpo_clip = osc_page_clip,
+ .cpo_cancel = osc_page_cancel,
+ .cpo_flush = osc_page_flush
+};
+
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+ struct cl_page *page, struct page *vmpage)
+{
+ struct osc_object *osc = cl2osc(obj);
+ struct osc_page *opg = cl_object_page_slice(obj, page);
+ int result;
+
+ opg->ops_from = 0;
+ opg->ops_to = PAGE_CACHE_SIZE;
+
+ result = osc_prep_async_page(osc, opg, vmpage,
+ cl_offset(obj, page->cp_index));
+ if (result == 0) {
+ struct osc_io *oio = osc_env_io(env);
+ opg->ops_srvlock = osc_io_srvlock(oio);
+ cl_page_slice_add(page, &opg->ops_cl, obj,
+ &osc_page_ops);
+ }
+ /*
+ * Cannot assert osc_page_protected() here as read-ahead
+ * creates temporary pages outside of a lock.
+ */
+ /* ops_inflight and ops_lru are the same field, but it doesn't
+ * hurt to initialize it twice :-) */
+ INIT_LIST_HEAD(&opg->ops_inflight);
+ INIT_LIST_HEAD(&opg->ops_lru);
+
+ /* reserve an LRU space for this page */
+ if (page->cp_type == CPT_CACHEABLE && result == 0)
+ result = osc_lru_reserve(env, osc, opg);
+
+ return result;
+}
+
+/**
+ * Helper function called by osc_io_submit() for every page in an immediate
+ * transfer (i.e., transferred synchronously).
+ */
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+ enum cl_req_type crt, int brw_flags)
+{
+ struct osc_async_page *oap = &opg->ops_oap;
+ struct osc_object *obj = oap->oap_obj;
+
+ LINVRNT(osc_page_protected(env, opg,
+ crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1));
+
+ LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n",
+ oap, oap->oap_magic);
+ LASSERT(oap->oap_async_flags & ASYNC_READY);
+ LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE);
+
+ oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+ oap->oap_page_off = opg->ops_from;
+ oap->oap_count = opg->ops_to - opg->ops_from;
+ oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
+
+ if (!client_is_remote(osc_export(obj)) &&
+ capable(CFS_CAP_SYS_RESOURCE)) {
+ oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+ oap->oap_cmd |= OBD_BRW_NOQUOTA;
+ }
+
+ opg->ops_submit_time = cfs_time_current();
+ osc_page_transfer_get(opg, "transfer\0imm");
+ osc_page_transfer_add(env, opg, crt);
+}
+
+/* --------------- LRU page management ------------------ */
+
+/* OSC is a natural place to manage LRU pages, as applications tend to issue
+ * their writes one OSC at a time. Ideally, an OSC that is used more frequently
+ * should occupy more LRU slots. On the other hand, we should avoid using up
+ * all LRU slots (client_obd::cl_lru_left), otherwise processes have to be put
+ * to sleep waiting for free LRU slots, which would be very bad. The algorithm
+ * therefore requires each OSC to free slots voluntarily, so that a reasonable
+ * number of free slots is maintained at all times.
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq);
+static atomic_t osc_lru_waiters = ATOMIC_INIT(0);
+/* LRU pages are freed in batch mode. OSC should at least free this
+ * number of pages to avoid running out of LRU budget, and.. */
+static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT); /* 2M */
+/* free at most this number, otherwise it would take too long to finish. */
+static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */
+
+/* Check if we can free LRU slots from this OSC. If there are LRU waiters,
+ * we should free slots aggressively. In this way, slots are freed at a steady
+ * pace, maintaining fairness among OSCs.
+ *
+ * Return how many LRU pages should be freed. */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+ struct cl_client_cache *cache = cli->cl_cache;
+ int pages = atomic_read(&cli->cl_lru_in_list) >> 1;
+
+ if (atomic_read(&osc_lru_waiters) > 0 &&
+ atomic_read(cli->cl_lru_left) < lru_shrink_max)
+ /* drop lru pages aggressively */
+ return min(pages, lru_shrink_max);
+
+ /* if it's about to run out of LRU slots, we should free some, but not
+ * too many, to maintain fairness among OSCs. */
+ if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) {
+ unsigned long tmp;
+
+ tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users);
+ if (pages > tmp)
+ return min(pages, lru_shrink_max);
+
+ return pages > lru_shrink_min ? lru_shrink_min : 0;
+ }
+
+ return 0;
+}
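A rough worked example of this freeing policy, with invented numbers and a simplified clamp (it skips the per-OSC fair-share division that the function above also applies):

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	const int shrink_min = 512;	/* assumed 2 MiB of 4 KiB pages */
	const int shrink_max = 8192;	/* assumed 32 MiB of 4 KiB pages */
	int in_list = 3000;		/* example LRU population of this OSC */
	int candidate = in_list >> 1;	/* free at most half of what is listed */

	/* waiters present and the shared budget is tight: free aggressively */
	printf("aggressive: %d pages\n", min_int(candidate, shrink_max));
	/* otherwise fall back to the small steady batch (or nothing at all) */
	printf("steady: %d pages\n", candidate > shrink_min ? shrink_min : 0);
	return 0;
}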
+
+/* Return how many pages are not discarded in @pvec. */
+static int discard_pagevec(const struct lu_env *env, struct cl_io *io,
+ struct cl_page **pvec, int max_index)
+{
+ int count;
+ int i;
+
+ for (count = 0, i = 0; i < max_index; i++) {
+ struct cl_page *page = pvec[i];
+ if (cl_page_own_try(env, io, page) == 0) {
+ /* free an LRU page only if nobody is using it.
+ * This check is necessary to avoid freeing pages that
+ * have already been removed from the LRU and pinned
+ * for IO. */
+ if (!cl_page_in_use(page)) {
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ ++count;
+ }
+ cl_page_disown(env, io, page);
+ }
+ cl_page_put(env, page);
+ pvec[i] = NULL;
+ }
+ return max_index - count;
+}
+
+/**
+ * Drop @target of pages from LRU at most.
+ */
+int osc_lru_shrink(struct client_obd *cli, int target)
+{
+ struct cl_env_nest nest;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct cl_object *clobj = NULL;
+ struct cl_page **pvec;
+ struct osc_page *opg;
+ int maxscan = 0;
+ int count = 0;
+ int index = 0;
+ int rc = 0;
+
+ LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0);
+ if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+ return 0;
+
+ env = cl_env_nested_get(&nest);
+ if (IS_ERR(env))
+ return PTR_ERR(env);
+
+ pvec = osc_env_info(env)->oti_pvec;
+ io = &osc_env_info(env)->oti_io;
+
+ client_obd_list_lock(&cli->cl_lru_list_lock);
+ atomic_inc(&cli->cl_lru_shrinkers);
+ maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list));
+ while (!list_empty(&cli->cl_lru_list)) {
+ struct cl_page *page;
+
+ if (--maxscan < 0)
+ break;
+
+ opg = list_entry(cli->cl_lru_list.next, struct osc_page,
+ ops_lru);
+ page = cl_page_top(opg->ops_cl.cpl_page);
+ if (cl_page_in_use_noref(page)) {
+ list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+ continue;
+ }
+
+ LASSERT(page->cp_obj != NULL);
+ if (clobj != page->cp_obj) {
+ struct cl_object *tmp = page->cp_obj;
+
+ cl_object_get(tmp);
+ client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+ if (clobj != NULL) {
+ count -= discard_pagevec(env, io, pvec, index);
+ index = 0;
+
+ cl_io_fini(env, io);
+ cl_object_put(env, clobj);
+ clobj = NULL;
+ }
+
+ clobj = tmp;
+ io->ci_obj = clobj;
+ io->ci_ignore_layout = 1;
+ rc = cl_io_init(env, io, CIT_MISC, clobj);
+
+ client_obd_list_lock(&cli->cl_lru_list_lock);
+
+ if (rc != 0)
+ break;
+
+ ++maxscan;
+ continue;
+ }
+
+ /* move this page to the end of the list as it will be discarded
+ * soon. The page will finally be removed from the LRU list in
+ * osc_page_delete(). */
+ list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+ /* it's okay to grab a refcount here w/o holding the lock because
+ * whoever deletes the page has to grab cl_lru_list_lock first. */
+ cl_page_get(page);
+ pvec[index++] = page;
+ if (++count >= target)
+ break;
+
+ if (unlikely(index == OTI_PVEC_SIZE)) {
+ client_obd_list_unlock(&cli->cl_lru_list_lock);
+ count -= discard_pagevec(env, io, pvec, index);
+ index = 0;
+
+ client_obd_list_lock(&cli->cl_lru_list_lock);
+ }
+ }
+ client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+ if (clobj != NULL) {
+ count -= discard_pagevec(env, io, pvec, index);
+
+ cl_io_fini(env, io);
+ cl_object_put(env, clobj);
+ }
+ cl_env_nested_put(&nest, env);
+
+ atomic_dec(&cli->cl_lru_shrinkers);
+ return count > 0 ? count : rc;
+}
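The loop above collects up to OTI_PVEC_SIZE page pointers while holding cl_lru_list_lock and then drops the lock to discard the batch. A minimal user-space sketch of that batch-under-lock pattern, with a pthread mutex and a plain array standing in for the kernel lock and LRU list:

#include <pthread.h>
#include <stdio.h>

#define BATCH 4	/* stands in for OTI_PVEC_SIZE */

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static int lru[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
static int lru_len = 10;

static void discard_batch(int *batch, int n)
{
	/* the "slow" work is done without holding lru_lock */
	for (int i = 0; i < n; i++)
		printf("discard page %d\n", batch[i]);
}

int main(void)
{
	int batch[BATCH];
	int n = 0;

	pthread_mutex_lock(&lru_lock);
	while (lru_len > 0) {
		batch[n++] = lru[--lru_len];	/* collect under the lock */
		if (n == BATCH) {
			pthread_mutex_unlock(&lru_lock);
			discard_batch(batch, n);	/* lock dropped here */
			n = 0;
			pthread_mutex_lock(&lru_lock);
		}
	}
	pthread_mutex_unlock(&lru_lock);
	discard_batch(batch, n);	/* flush the final partial batch */
	return 0;
}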
+
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg)
+{
+ bool wakeup = false;
+
+ if (!opg->ops_in_lru)
+ return;
+
+ atomic_dec(&cli->cl_lru_busy);
+ client_obd_list_lock(&cli->cl_lru_list_lock);
+ if (list_empty(&opg->ops_lru)) {
+ list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+ atomic_inc_return(&cli->cl_lru_in_list);
+ wakeup = atomic_read(&osc_lru_waiters) > 0;
+ }
+ client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+ if (wakeup) {
+ osc_lru_shrink(cli, osc_cache_too_much(cli));
+ wake_up_all(&osc_lru_waitq);
+ }
+}
+
+/* delete page from the LRU list. The page can be deleted from the LRU list
+ * for two reasons: redirtied or deleted from the page cache. */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del)
+{
+ if (opg->ops_in_lru) {
+ client_obd_list_lock(&cli->cl_lru_list_lock);
+ if (!list_empty(&opg->ops_lru)) {
+ LASSERT(atomic_read(&cli->cl_lru_in_list) > 0);
+ list_del_init(&opg->ops_lru);
+ atomic_dec(&cli->cl_lru_in_list);
+ if (!del)
+ atomic_inc(&cli->cl_lru_busy);
+ } else if (del) {
+ LASSERT(atomic_read(&cli->cl_lru_busy) > 0);
+ atomic_dec(&cli->cl_lru_busy);
+ }
+ client_obd_list_unlock(&cli->cl_lru_list_lock);
+ if (del) {
+ atomic_inc(cli->cl_lru_left);
+ /* this is a great place to release more LRU pages if
+ * this osc occupies too many LRU pages and the kernel is
+ * stealing one of them.
+ * cl_lru_shrinkers avoids a recursive call in case
+ * we're already in the context of osc_lru_shrink(). */
+ if (atomic_read(&cli->cl_lru_shrinkers) == 0 &&
+ !memory_pressure_get())
+ osc_lru_shrink(cli, osc_cache_too_much(cli));
+ wake_up(&osc_lru_waitq);
+ }
+ } else {
+ LASSERT(list_empty(&opg->ops_lru));
+ }
+}
+
+static inline int max_to_shrink(struct client_obd *cli)
+{
+ return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max);
+}
+
+static int osc_lru_reclaim(struct client_obd *cli)
+{
+ struct cl_client_cache *cache = cli->cl_cache;
+ int max_scans;
+ int rc;
+
+ LASSERT(cache != NULL);
+ LASSERT(!list_empty(&cache->ccc_lru));
+
+ rc = osc_lru_shrink(cli, lru_shrink_min);
+ if (rc != 0) {
+ CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n",
+ cli->cl_import->imp_obd->obd_name, rc, cli);
+ return rc;
+ }
+
+ CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n",
+ cli->cl_import->imp_obd->obd_name, cli,
+ atomic_read(&cli->cl_lru_in_list),
+ atomic_read(&cli->cl_lru_busy));
+
+ /* Reclaim LRU slots from other client_obds as this one can't free
+ * enough from its own. This should rarely happen. */
+ spin_lock(&cache->ccc_lru_lock);
+ cache->ccc_lru_shrinkers++;
+ list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+
+ max_scans = atomic_read(&cache->ccc_users);
+ while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) {
+ cli = list_entry(cache->ccc_lru.next, struct client_obd,
+ cl_lru_osc);
+
+ CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n",
+ cli->cl_import->imp_obd->obd_name, cli,
+ atomic_read(&cli->cl_lru_in_list),
+ atomic_read(&cli->cl_lru_busy));
+
+ list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+ if (atomic_read(&cli->cl_lru_in_list) > 0) {
+ spin_unlock(&cache->ccc_lru_lock);
+
+ rc = osc_lru_shrink(cli, max_to_shrink(cli));
+ spin_lock(&cache->ccc_lru_lock);
+ if (rc != 0)
+ break;
+ }
+ }
+ spin_unlock(&cache->ccc_lru_lock);
+
+ CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n",
+ cli->cl_import->imp_obd->obd_name, cli, rc);
+ return rc;
+}
+
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+ struct osc_page *opg)
+{
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ struct client_obd *cli = osc_cli(obj);
+ int rc = 0;
+
+ if (cli->cl_cache == NULL) /* shall not be in LRU */
+ return 0;
+
+ LASSERT(atomic_read(cli->cl_lru_left) >= 0);
+ while (!atomic_add_unless(cli->cl_lru_left, -1, 0)) {
+ int gen;
+
+ /* ran out of LRU slots, try to drop some ourselves */
+ rc = osc_lru_reclaim(cli);
+ if (rc < 0)
+ break;
+ if (rc > 0)
+ continue;
+
+ cond_resched();
+
+ /* slowest case: all cached pages are busy, so notify
+ * other OSCs that we are short of LRU slots. */
+ atomic_inc(&osc_lru_waiters);
+
+ gen = atomic_read(&cli->cl_lru_in_list);
+ rc = l_wait_event(osc_lru_waitq,
+ atomic_read(cli->cl_lru_left) > 0 ||
+ (atomic_read(&cli->cl_lru_in_list) > 0 &&
+ gen != atomic_read(&cli->cl_lru_in_list)),
+ &lwi);
+
+ atomic_dec(&osc_lru_waiters);
+ if (rc < 0)
+ break;
+ }
+
+ if (rc >= 0) {
+ atomic_inc(&cli->cl_lru_busy);
+ opg->ops_in_lru = 1;
+ rc = 0;
+ }
+
+ return rc;
+}
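The reservation loop above tries to take one slot from cl_lru_left, falls back to reclaiming, and only then sleeps. A small user-space sketch of the take-one-or-reclaim part, with C11 atomics standing in for the kernel atomic_add_unless() helper and a made-up reclaim step in place of osc_lru_reclaim():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int lru_left = 4;	/* free LRU slots shared by all "OSCs" */

/* stand-in for osc_lru_reclaim(): pretend one slot could always be freed */
static void fake_reclaim(void)
{
	atomic_fetch_add(&lru_left, 1);
}

/* take one slot unless the counter is already zero, like atomic_add_unless() */
static bool take_slot(void)
{
	int old = atomic_load(&lru_left);

	while (old > 0)
		if (atomic_compare_exchange_weak(&lru_left, &old, old - 1))
			return true;
	return false;
}

int main(void)
{
	for (int i = 0; i < 6; i++) {
		while (!take_slot())
			fake_reclaim();	/* the real code may also sleep here */
		printf("page %d reserved, %d slots left\n",
		       i, atomic_load(&lru_left));
	}
	return 0;
}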
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c
new file mode 100644
index 000000000..6690f149a
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_quota.c
@@ -0,0 +1,327 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include "../include/obd_class.h"
+#include "osc_internal.h"
+
+static inline struct osc_quota_info *osc_oqi_alloc(u32 id)
+{
+ struct osc_quota_info *oqi;
+
+ OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem);
+ if (oqi != NULL)
+ oqi->oqi_id = id;
+
+ return oqi;
+}
+
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ struct osc_quota_info *oqi;
+
+ oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+ if (oqi) {
+ /* do not try to access oqi here, it could have been
+ * freed by osc_quota_setdq() */
+
+ /* the slot is busy, the user is about to run out of
+ * quota space on this OST */
+ CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n",
+ type == USRQUOTA ? "user" : "grout", qid[type]);
+ return NO_QUOTA;
+ }
+ }
+
+ return QUOTA_OK;
+}
+
+#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \
+ : OBD_MD_FLGRPQUOTA)
+#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_FL_NO_USRQUOTA \
+ : OBD_FL_NO_GRPQUOTA)
+
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+ u32 valid, u32 flags)
+{
+ int type;
+ int rc = 0;
+
+ if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0)
+ return 0;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ struct osc_quota_info *oqi;
+
+ if ((valid & MD_QUOTA_FLAG(type)) == 0)
+ continue;
+
+ /* lookup the ID in the per-type hash table */
+ oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+ if ((flags & FL_QUOTA_FLAG(type)) != 0) {
+ /* This ID is getting close to its quota limit, let's
+ * switch to sync I/O */
+ if (oqi != NULL)
+ continue;
+
+ oqi = osc_oqi_alloc(qid[type]);
+ if (oqi == NULL) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+ &qid[type], &oqi->oqi_hash);
+ /* race with others? */
+ if (rc == -EALREADY) {
+ rc = 0;
+ OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+ }
+
+ CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+ cli->cl_import->imp_obd->obd_name,
+ type == USRQUOTA ? "user" : "group",
+ qid[type], rc);
+ } else {
+ /* This ID is now off the hook, let's remove it from
+ * the hash table */
+ if (oqi == NULL)
+ continue;
+
+ oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+ &qid[type]);
+ if (oqi)
+ OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+ CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+ cli->cl_import->imp_obd->obd_name,
+ type == USRQUOTA ? "user" : "group",
+ qid[type], oqi);
+ }
+ }
+
+ return rc;
+}
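A toy model of how osc_quota_setdq() and osc_quota_chkdq() interact: a per-type flag stands in for cl_quota_hash[type], and the names and values below are invented for illustration rather than taken from the Lustre headers.

#include <stdbool.h>
#include <stdio.h>

enum { USR = 0, GRP = 1, NTYPES = 2 };

static bool near_limit[NTYPES];	/* stands in for cl_quota_hash[type] */

/* setdq: remember whether the last write reply flagged this ID as short */
static void toy_setdq(int type, bool server_says_no_quota)
{
	near_limit[type] = server_says_no_quota;
}

/* chkdq: any flagged type forces the client to fall back to sync I/O */
static const char *toy_chkdq(void)
{
	for (int t = 0; t < NTYPES; t++)
		if (near_limit[t])
			return "NO_QUOTA (fall back to sync I/O)";
	return "QUOTA_OK (async cached writes allowed)";
}

int main(void)
{
	printf("%s\n", toy_chkdq());
	toy_setdq(USR, true);	/* a write reply carried the no-user-quota flag */
	printf("%s\n", toy_chkdq());
	toy_setdq(USR, false);	/* a later reply cleared it again */
	printf("%s\n", toy_chkdq());
	return 0;
}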
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_u32_hash(*((__u32 *)key), mask);
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct osc_quota_info *oqi;
+ u32 uid;
+
+ LASSERT(key != NULL);
+ uid = *((u32 *)key);
+ oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+ return uid == oqi->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+ struct osc_quota_info *oqi;
+ oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+ return &oqi->oqi_id;
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
+static void
+oqi_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_exit(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct osc_quota_info *oqi;
+
+ oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+ OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
+
+static cfs_hash_ops_t quota_hash_ops = {
+ .hs_hash = oqi_hashfn,
+ .hs_keycmp = oqi_keycmp,
+ .hs_key = oqi_key,
+ .hs_object = oqi_object,
+ .hs_get = oqi_get,
+ .hs_put_locked = oqi_put_locked,
+ .hs_exit = oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+ int i, type;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+ HASH_QUOTA_CUR_BITS,
+ HASH_QUOTA_MAX_BITS,
+ HASH_QUOTA_BKT_BITS,
+ 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &quota_hash_ops,
+ CFS_HASH_DEFAULT);
+ if (cli->cl_quota_hash[type] == NULL)
+ break;
+ }
+
+ if (type == MAXQUOTAS)
+ return 0;
+
+ for (i = 0; i < type; i++)
+ cfs_hash_putref(cli->cl_quota_hash[i]);
+
+ return -ENOMEM;
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ cfs_hash_putref(cli->cl_quota_hash[type]);
+
+ return 0;
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct ptlrpc_request *req;
+ struct obd_quotactl *oqc;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+ OST_QUOTACTL);
+ if (req == NULL)
+ return -ENOMEM;
+
+ oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ *oqc = *oqctl;
+
+ ptlrpc_request_set_replen(req);
+ ptlrpc_at_set_req_timeout(req);
+ req->rq_no_resend = 1;
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+ if (req->rq_repmsg) {
+ oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ if (oqc) {
+ *oqctl = *oqc;
+ } else if (!rc) {
+ CERROR("Can't unpack obd_quotactl\n");
+ rc = -EPROTO;
+ }
+ } else if (!rc) {
+ CERROR("Can't unpack obd_quotactl\n");
+ rc = -EPROTO;
+ }
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ struct ptlrpc_request *req;
+ struct obd_quotactl *body;
+ int rc;
+
+ req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+ &RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION,
+ OST_QUOTACHECK);
+ if (req == NULL)
+ return -ENOMEM;
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+ *body = *oqctl;
+
+ ptlrpc_request_set_replen(req);
+
+ /* the next poll will find -ENODATA, which means quotacheck is
+ * in progress */
+ cli->cl_qchk_stat = -ENODATA;
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ cli->cl_qchk_stat = rc;
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ int rc;
+
+ qchk->obd_uuid = cli->cl_target_uuid;
+ memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME));
+
+ rc = cli->cl_qchk_stat;
+ /* the client is not the previous one */
+ if (rc == CL_NOT_QUOTACHECKED)
+ rc = -EINTR;
+ return rc;
+}
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
new file mode 100644
index 000000000..d7a9b650d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/osc/osc_request.c
@@ -0,0 +1,3379 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre/lustre_user.h"
+#include "../include/obd_cksum.h"
+
+#include "../include/lustre_ha.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre_debug.h"
+#include "../include/lustre_param.h"
+#include "../include/lustre_fid.h"
+#include "../include/obd_class.h"
+#include "osc_internal.h"
+#include "osc_cl_internal.h"
+
+struct osc_brw_async_args {
+ struct obdo *aa_oa;
+ int aa_requested_nob;
+ int aa_nio_count;
+ u32 aa_page_count;
+ int aa_resends;
+ struct brw_page **aa_ppga;
+ struct client_obd *aa_cli;
+ struct list_head aa_oaps;
+ struct list_head aa_exts;
+ struct obd_capa *aa_ocapa;
+ struct cl_req *aa_clerq;
+};
+
+struct osc_async_args {
+ struct obd_info *aa_oi;
+};
+
+struct osc_setattr_args {
+ struct obdo *sa_oa;
+ obd_enqueue_update_f sa_upcall;
+ void *sa_cookie;
+};
+
+struct osc_fsync_args {
+ struct obd_info *fa_oi;
+ obd_enqueue_update_f fa_upcall;
+ void *fa_cookie;
+};
+
+struct osc_enqueue_args {
+ struct obd_export *oa_exp;
+ __u64 *oa_flags;
+ obd_enqueue_update_f oa_upcall;
+ void *oa_cookie;
+ struct ost_lvb *oa_lvb;
+ struct lustre_handle *oa_lockh;
+ struct ldlm_enqueue_info *oa_ei;
+ unsigned int oa_agl:1;
+};
+
+static void osc_release_ppga(struct brw_page **ppga, u32 count);
+static int brw_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req, void *data, int rc);
+int osc_cleanup(struct obd_device *obd);
+
+/* Pack OSC object metadata for disk storage (LE byte order). */
+static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+ struct lov_stripe_md *lsm)
+{
+ int lmm_size;
+
+ lmm_size = sizeof(**lmmp);
+ if (lmmp == NULL)
+ return lmm_size;
+
+ if (*lmmp != NULL && lsm == NULL) {
+ OBD_FREE(*lmmp, lmm_size);
+ *lmmp = NULL;
+ return 0;
+ } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) {
+ return -EBADF;
+ }
+
+ if (*lmmp == NULL) {
+ OBD_ALLOC(*lmmp, lmm_size);
+ if (*lmmp == NULL)
+ return -ENOMEM;
+ }
+
+ if (lsm)
+ ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi);
+
+ return lmm_size;
+}
+
+/* Unpack OSC object metadata from disk storage (LE byte order). */
+static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+ struct lov_mds_md *lmm, int lmm_bytes)
+{
+ int lsm_size;
+ struct obd_import *imp = class_exp2cliimp(exp);
+
+ if (lmm != NULL) {
+ if (lmm_bytes < sizeof(*lmm)) {
+ CERROR("%s: lov_mds_md too small: %d, need %d\n",
+ exp->exp_obd->obd_name, lmm_bytes,
+ (int)sizeof(*lmm));
+ return -EINVAL;
+ }
+ /* XXX LOV_MAGIC etc check? */
+
+ if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) {
+ CERROR("%s: zero lmm_object_id: rc = %d\n",
+ exp->exp_obd->obd_name, -EINVAL);
+ return -EINVAL;
+ }
+ }
+
+ lsm_size = lov_stripe_md_size(1);
+ if (lsmp == NULL)
+ return lsm_size;
+
+ if (*lsmp != NULL && lmm == NULL) {
+ OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+ OBD_FREE(*lsmp, lsm_size);
+ *lsmp = NULL;
+ return 0;
+ }
+
+ if (*lsmp == NULL) {
+ OBD_ALLOC(*lsmp, lsm_size);
+ if (unlikely(*lsmp == NULL))
+ return -ENOMEM;
+ OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+ if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) {
+ OBD_FREE(*lsmp, lsm_size);
+ return -ENOMEM;
+ }
+ loi_init((*lsmp)->lsm_oinfo[0]);
+ } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) {
+ return -EBADF;
+ }
+
+ if (lmm != NULL)
+ /* XXX zero *lsmp? */
+ ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi);
+
+ if (imp != NULL &&
+ (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES))
+ (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+ else
+ (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+
+ return lsm_size;
+}
+
+static inline void osc_pack_capa(struct ptlrpc_request *req,
+ struct ost_body *body, void *capa)
+{
+ struct obd_capa *oc = (struct obd_capa *)capa;
+ struct lustre_capa *c;
+
+ if (!capa)
+ return;
+
+ c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1);
+ LASSERT(c);
+ capa_cpy(c, oc);
+ body->oa.o_valid |= OBD_MD_FLOSSCAPA;
+ DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+static inline void osc_pack_req_body(struct ptlrpc_request *req,
+ struct obd_info *oinfo)
+{
+ struct ost_body *body;
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ LASSERT(body);
+
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+ oinfo->oi_oa);
+ osc_pack_capa(req, body, oinfo->oi_capa);
+}
+
+static inline void osc_set_capa_size(struct ptlrpc_request *req,
+ const struct req_msg_field *field,
+ struct obd_capa *oc)
+{
+ if (oc == NULL)
+ req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+ else
+ /* it is already calculated as sizeof struct obd_capa */
+ ;
+}
+
+static int osc_getattr_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ struct osc_async_args *aa, int rc)
+{
+ struct ost_body *body;
+
+ if (rc != 0)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body) {
+ CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+ aa->aa_oi->oi_oa, &body->oa);
+
+ /* This should really be sent by the OST */
+ aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE;
+ aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+ } else {
+ CDEBUG(D_INFO, "can't unpack ost_body\n");
+ rc = -EPROTO;
+ aa->aa_oi->oi_oa->o_valid = 0;
+ }
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+ return rc;
+}
+
+static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+ struct ptlrpc_request_set *set)
+{
+ struct ptlrpc_request *req;
+ struct osc_async_args *aa;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ osc_pack_req_body(req, oinfo);
+
+ ptlrpc_request_set_replen(req);
+ req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret;
+
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->aa_oi = oinfo;
+
+ ptlrpc_set_add_req(set, req);
+ return 0;
+}
+
+static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo)
+{
+ struct ptlrpc_request *req;
+ struct ost_body *body;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ osc_pack_req_body(req, oinfo);
+
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa,
+ &body->oa);
+
+ oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd);
+ oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+
+ out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
+ struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+ struct ptlrpc_request *req;
+ struct ost_body *body;
+ int rc;
+
+ LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ osc_pack_req_body(req, oinfo);
+
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa,
+ &body->oa);
+
+out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int osc_setattr_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ struct osc_setattr_args *sa, int rc)
+{
+ struct ost_body *body;
+
+ if (rc != 0)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
+ &body->oa);
+out:
+ rc = sa->sa_upcall(sa->sa_cookie, rc);
+ return rc;
+}
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ptlrpc_request_set *rqset)
+{
+ struct ptlrpc_request *req;
+ struct osc_setattr_args *sa;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+ if (req == NULL)
+ return -ENOMEM;
+
+ osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+ oinfo->oi_oa->o_lcookie = *oti->oti_logcookies;
+
+ osc_pack_req_body(req, oinfo);
+
+ ptlrpc_request_set_replen(req);
+
+ /* do MDS-to-OST setattr asynchronously */
+ if (!rqset) {
+ /* Do not wait for response. */
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ } else {
+ req->rq_interpret_reply =
+ (ptlrpc_interpterer_t)osc_setattr_interpret;
+
+ CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args));
+ sa = ptlrpc_req_async_args(req);
+ sa->sa_oa = oinfo->oi_oa;
+ sa->sa_upcall = upcall;
+ sa->sa_cookie = cookie;
+
+ if (rqset == PTLRPCD_SET)
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ else
+ ptlrpc_set_add_req(rqset, req);
+ }
+
+ return 0;
+}
+
+static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+ struct obd_trans_info *oti,
+ struct ptlrpc_request_set *rqset)
+{
+ return osc_setattr_async_base(exp, oinfo, oti,
+ oinfo->oi_cb_up, oinfo, rqset);
+}
+
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+ struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+ struct ptlrpc_request *req;
+ struct ost_body *body;
+ struct lov_stripe_md *lsm;
+ int rc;
+
+ LASSERT(oa);
+ LASSERT(ea);
+
+ lsm = *ea;
+ if (!lsm) {
+ rc = obd_alloc_memmd(exp, &lsm);
+ if (rc < 0)
+ return rc;
+ }
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
+ if (rc) {
+ ptlrpc_request_free(req);
+ goto out;
+ }
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ LASSERT(body);
+
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+ ptlrpc_request_set_replen(req);
+
+ if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+ oa->o_flags == OBD_FL_DELORPHAN) {
+ DEBUG_REQ(D_HA, req,
+ "delorphan from OST integration");
+ /* Don't resend the delorphan req */
+ req->rq_no_resend = req->rq_no_delay = 1;
+ }
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out_req;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body == NULL) {
+ rc = -EPROTO;
+ goto out_req;
+ }
+
+ CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
+
+ oa->o_blksize = cli_brw_size(exp->exp_obd);
+ oa->o_valid |= OBD_MD_FLBLKSZ;
+
+ /* XXX LOV STACKING: the lsm that is passed to us from LOV does not
+ * have valid lsm_oinfo data structs, so don't go touching that.
+ * This needs to be fixed in a big way.
+ */
+ lsm->lsm_oi = oa->o_oi;
+ *ea = lsm;
+
+ if (oti != NULL) {
+ oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+ if (oa->o_valid & OBD_MD_FLCOOKIE) {
+ if (!oti->oti_logcookies)
+ oti_alloc_cookies(oti, 1);
+ *oti->oti_logcookies = oa->o_lcookie;
+ }
+ }
+
+ CDEBUG(D_HA, "transno: %lld\n",
+ lustre_msg_get_transno(req->rq_repmsg));
+out_req:
+ ptlrpc_req_finished(req);
+out:
+ if (rc && !*ea)
+ obd_free_memmd(exp, &lsm);
+ return rc;
+}
+
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ptlrpc_request_set *rqset)
+{
+ struct ptlrpc_request *req;
+ struct osc_setattr_args *sa;
+ struct ost_body *body;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
+ if (req == NULL)
+ return -ENOMEM;
+
+ osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+ req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+ ptlrpc_at_set_req_timeout(req);
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ LASSERT(body);
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+ oinfo->oi_oa);
+ osc_pack_capa(req, body, oinfo->oi_capa);
+
+ ptlrpc_request_set_replen(req);
+
+ req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+ CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args));
+ sa = ptlrpc_req_async_args(req);
+ sa->sa_oa = oinfo->oi_oa;
+ sa->sa_upcall = upcall;
+ sa->sa_cookie = cookie;
+ if (rqset == PTLRPCD_SET)
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ else
+ ptlrpc_set_add_req(rqset, req);
+
+ return 0;
+}
+
+static int osc_sync_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *arg, int rc)
+{
+ struct osc_fsync_args *fa = arg;
+ struct ost_body *body;
+
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body == NULL) {
+ CERROR ("can't unpack ost_body\n");
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *fa->fa_oi->oi_oa = body->oa;
+out:
+ rc = fa->fa_upcall(fa->fa_cookie, rc);
+ return rc;
+}
+
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ptlrpc_request_set *rqset)
+{
+ struct ptlrpc_request *req;
+ struct ost_body *body;
+ struct osc_fsync_args *fa;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
+ if (req == NULL)
+ return -ENOMEM;
+
+ osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ /* overload the size and blocks fields in the oa with start/end */
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ LASSERT(body);
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+ oinfo->oi_oa);
+ osc_pack_capa(req, body, oinfo->oi_capa);
+
+ ptlrpc_request_set_replen(req);
+ req->rq_interpret_reply = osc_sync_interpret;
+
+ CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args));
+ fa = ptlrpc_req_async_args(req);
+ fa->fa_oi = oinfo;
+ fa->fa_upcall = upcall;
+ fa->fa_cookie = cookie;
+
+ if (rqset == PTLRPCD_SET)
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ else
+ ptlrpc_set_add_req(rqset, req);
+
+ return 0;
+}
+
+/* Find and cancel locally the locks matched by @mode in the resource found by
+ * @objid. Found locks are added to the @cancels list. Returns the number of
+ * locks added to the @cancels list. */
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
+ struct list_head *cancels,
+ ldlm_mode_t mode, __u64 lock_flags)
+{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+ struct ldlm_res_id res_id;
+ struct ldlm_resource *res;
+ int count;
+
+ /* Return, i.e. cancel nothing, only if ELC is supported (flag in
+ * export) but disabled through procfs (flag in NS).
+ *
+ * This distinguishes it from the case when ELC is not supported at all,
+ * where we still want to cancel locks in advance and just cancel them
+ * locally, without sending any RPC. */
+ if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+ return 0;
+
+ ostid_build_res_name(&oa->o_oi, &res_id);
+ res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+ if (res == NULL)
+ return 0;
+
+ LDLM_RESOURCE_ADDREF(res);
+ count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
+ lock_flags, 0, NULL);
+ LDLM_RESOURCE_DELREF(res);
+ ldlm_resource_putref(res);
+ return count;
+}
+
+static int osc_destroy_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req, void *data,
+ int rc)
+{
+ struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+
+ atomic_dec(&cli->cl_destroy_in_flight);
+ wake_up(&cli->cl_destroy_waitq);
+ return 0;
+}
+
+static int osc_can_send_destroy(struct client_obd *cli)
+{
+ if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
+ cli->cl_max_rpcs_in_flight) {
+ /* The destroy request can be sent */
+ return 1;
+ }
+ if (atomic_dec_return(&cli->cl_destroy_in_flight) <
+ cli->cl_max_rpcs_in_flight) {
+ /*
+ * The counter has been modified between the two atomic
+ * operations.
+ */
+ wake_up(&cli->cl_destroy_waitq);
+ }
+ return 0;
+}
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md **ea,
+ struct obd_trans_info *oti)
+{
+ int rc = 0;
+
+ LASSERT(oa);
+ LASSERT(ea);
+ LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+ if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+ oa->o_flags == OBD_FL_RECREATE_OBJS) {
+ return osc_real_create(exp, oa, ea, oti);
+ }
+
+ if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi)))
+ return osc_real_create(exp, oa, ea, oti);
+
+ /* we should not get here anymore */
+ LBUG();
+
+ return rc;
+}
+
+/* Destroy requests can always be async on the client, and we don't even
+ * really care about the return code since the client cannot do anything at
+ * all about a destroy failure.
+ * When the MDS is unlinking a filename, it saves the file objects into a
+ * recovery llog, and these object records are cancelled when the OST reports
+ * they were destroyed and sync'd to disk (i.e. transaction committed).
+ * If the client dies, or the OST is down when the object should be destroyed,
+ * the records are not cancelled, and when the OST next reconnects to the MDS,
+ * it will retrieve the llog unlink logs and then send the log cancellation
+ * cookies to the MDS after committing the destroy transactions. */
+static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
+ struct obdo *oa, struct lov_stripe_md *ea,
+ struct obd_trans_info *oti, struct obd_export *md_export,
+ void *capa)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ struct ptlrpc_request *req;
+ struct ost_body *body;
+ LIST_HEAD(cancels);
+ int rc, count;
+
+ if (!oa) {
+ CDEBUG(D_INFO, "oa NULL\n");
+ return -EINVAL;
+ }
+
+ count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
+ LDLM_FL_DISCARD_DATA);
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
+ if (req == NULL) {
+ ldlm_lock_list_put(&cancels, l_bl_ast, count);
+ return -ENOMEM;
+ }
+
+ osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa);
+ rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
+ 0, &cancels, count);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+ ptlrpc_at_set_req_timeout(req);
+
+ if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE)
+ oa->o_lcookie = *oti->oti_logcookies;
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ LASSERT(body);
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+ osc_pack_capa(req, body, (struct obd_capa *)capa);
+ ptlrpc_request_set_replen(req);
+
+ /* If osc_destroy is for destroying an unlink orphan (a request
+ * sent from MDT to OST), it should not be blocked here, because
+ * the process might be triggered by ptlrpcd, and it is not good
+ * to block a ptlrpcd thread (b=16006) */
+ if (!(oa->o_flags & OBD_FL_DELORPHAN)) {
+ req->rq_interpret_reply = osc_destroy_interpret;
+ if (!osc_can_send_destroy(cli)) {
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP,
+ NULL);
+
+ /*
+ * Wait until the number of on-going destroy RPCs drops
+ * under max_rpc_in_flight
+ */
+ l_wait_event_exclusive(cli->cl_destroy_waitq,
+ osc_can_send_destroy(cli), &lwi);
+ }
+ }
+
+ /* Do not wait for response */
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ return 0;
+}
+
+static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
+ long writing_bytes)
+{
+ u32 bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
+
+ LASSERT(!(oa->o_valid & bits));
+
+ oa->o_valid |= bits;
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ oa->o_dirty = cli->cl_dirty;
+ if (unlikely(cli->cl_dirty - cli->cl_dirty_transit >
+ cli->cl_dirty_max)) {
+ CERROR("dirty %lu - %lu > dirty_max %lu\n",
+ cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
+ oa->o_undirty = 0;
+ } else if (unlikely(atomic_read(&obd_dirty_pages) -
+ atomic_read(&obd_dirty_transit_pages) >
+ (long)(obd_max_dirty_pages + 1))) {
+ /* The atomic_read() and the atomic_inc() it races with are
+ * not covered by a lock, so they may harmlessly race and trip
+ * this CERROR() unless we add in a small fudge factor (+1). */
+ CERROR("dirty %d - %d > system dirty_max %d\n",
+ atomic_read(&obd_dirty_pages),
+ atomic_read(&obd_dirty_transit_pages),
+ obd_max_dirty_pages);
+ oa->o_undirty = 0;
+ } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) {
+ CERROR("dirty %lu - dirty_max %lu too big???\n",
+ cli->cl_dirty, cli->cl_dirty_max);
+ oa->o_undirty = 0;
+ } else {
+ long max_in_flight = (cli->cl_max_pages_per_rpc <<
+ PAGE_CACHE_SHIFT)*
+ (cli->cl_max_rpcs_in_flight + 1);
+ oa->o_undirty = max(cli->cl_dirty_max, max_in_flight);
+ }
+ oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
+ oa->o_dropped = cli->cl_lost_grant;
+ cli->cl_lost_grant = 0;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n",
+ oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
+}
+
+void osc_update_next_shrink(struct client_obd *cli)
+{
+ cli->cl_next_shrink_grant =
+ cfs_time_shift(cli->cl_grant_shrink_interval);
+ CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
+ cli->cl_next_shrink_grant);
+}
+
+static void __osc_update_grant(struct client_obd *cli, u64 grant)
+{
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_avail_grant += grant;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+{
+ if (body->oa.o_valid & OBD_MD_FLGRANT) {
+ CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant);
+ __osc_update_grant(cli, body->oa.o_grant);
+ }
+}
+
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen,
+ void *val, struct ptlrpc_request_set *set);
+
+static int osc_shrink_grant_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *aa, int rc)
+{
+ struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+ struct obdo *oa = ((struct osc_brw_async_args *)aa)->aa_oa;
+ struct ost_body *body;
+
+ if (rc != 0) {
+ __osc_update_grant(cli, oa->o_grant);
+ goto out;
+ }
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ LASSERT(body);
+ osc_update_grant(cli, body);
+out:
+ OBDO_FREE(oa);
+ return rc;
+}
+
+static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
+{
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ oa->o_grant = cli->cl_avail_grant / 4;
+ cli->cl_avail_grant -= oa->o_grant;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
+ oa->o_valid |= OBD_MD_FLFLAGS;
+ oa->o_flags = 0;
+ }
+ oa->o_flags |= OBD_FL_SHRINK_GRANT;
+ osc_update_next_shrink(cli);
+}
+
+/* Shrink the current grant, either from some large amount to enough for a
+ * full set of in-flight RPCs, or if we have already shrunk to that limit
+ * then to enough for a single RPC. This avoids keeping more grant than
+ * needed, and avoids shrinking the grant piecemeal. */
+static int osc_shrink_grant(struct client_obd *cli)
+{
+ __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
+ (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ if (cli->cl_avail_grant <= target_bytes)
+ target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ return osc_shrink_grant_to_target(cli, target_bytes);
+}
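A rough numeric sketch of the target computed above; the client settings are invented example values, not defaults taken from this code:

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;		/* assumed 4 KiB pages */
	const unsigned long max_pages_per_rpc = 256;	/* example: 1 MiB RPCs */
	const unsigned long max_rpcs_in_flight = 8;	/* example value */
	unsigned long avail_grant = 64UL << 20;		/* pretend 64 MiB held */

	unsigned long full_set = (max_rpcs_in_flight + 1) *
				 max_pages_per_rpc * page_size;
	unsigned long single_rpc = max_pages_per_rpc * page_size;
	/* already at or below a full set: drop to a single RPC's worth */
	unsigned long target = avail_grant <= full_set ? single_rpc : full_set;

	printf("avail %lu MiB -> shrink target %lu MiB\n",
	       avail_grant >> 20, target >> 20);
	return 0;
}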
+
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
+{
+ int rc = 0;
+ struct ost_body *body;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ /* Don't shrink if we are already above or below the desired limit.
+ * We don't want to shrink below a single RPC, as that would negatively
+ * impact block allocation and long-term performance. */
+ if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT)
+ target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+ if (target_bytes >= cli->cl_avail_grant) {
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ return 0;
+ }
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ OBD_ALLOC_PTR(body);
+ if (!body)
+ return -ENOMEM;
+
+ osc_announce_cached(cli, &body->oa, 0);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ body->oa.o_grant = cli->cl_avail_grant - target_bytes;
+ cli->cl_avail_grant = target_bytes;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
+ body->oa.o_valid |= OBD_MD_FLFLAGS;
+ body->oa.o_flags = 0;
+ }
+ body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
+ osc_update_next_shrink(cli);
+
+ rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
+ sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
+ sizeof(*body), body, NULL);
+ if (rc != 0)
+ __osc_update_grant(cli, body->oa.o_grant);
+ OBD_FREE_PTR(body);
+ return rc;
+}
+
+static int osc_should_shrink_grant(struct client_obd *client)
+{
+ unsigned long time = cfs_time_current();
+ unsigned long next_shrink = client->cl_next_shrink_grant;
+
+ if ((client->cl_import->imp_connect_data.ocd_connect_flags &
+ OBD_CONNECT_GRANT_SHRINK) == 0)
+ return 0;
+
+ if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
+ /* Get the current RPC size directly, instead of going via:
+ * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
+ * Keep comment here so that it can be found by searching. */
+ int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+ if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+ client->cl_avail_grant > brw_size)
+ return 1;
+ else
+ osc_update_next_shrink(client);
+ }
+ return 0;
+}
+
+static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
+{
+ struct client_obd *client;
+
+ list_for_each_entry(client, &item->ti_obd_list,
+ cl_grant_shrink_list) {
+ if (osc_should_shrink_grant(client))
+ osc_shrink_grant(client);
+ }
+ return 0;
+}
+
+static int osc_add_shrink_grant(struct client_obd *client)
+{
+ int rc;
+
+ rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
+ TIMEOUT_GRANT,
+ osc_grant_shrink_grant_cb, NULL,
+ &client->cl_grant_shrink_list);
+ if (rc) {
+ CERROR("add grant client %s error %d\n",
+ client->cl_import->imp_obd->obd_name, rc);
+ return rc;
+ }
+ CDEBUG(D_CACHE, "add grant client %s \n",
+ client->cl_import->imp_obd->obd_name);
+ osc_update_next_shrink(client);
+ return 0;
+}
+
+static int osc_del_shrink_grant(struct client_obd *client)
+{
+ return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
+ TIMEOUT_GRANT);
+}
+
+static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+ /*
+ * ocd_grant is the total grant amount we're expected to hold: if we've
+ * been evicted, it's the new avail_grant amount, and cl_dirty will drop
+ * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty.
+ *
+ * race is tolerable here: if we're evicted, but imp_state already
+ * left EVICTED state, then cl_dirty must be 0 already.
+ */
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED)
+ cli->cl_avail_grant = ocd->ocd_grant;
+ else
+ cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty;
+
+ if (cli->cl_avail_grant < 0) {
+ CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n",
+ cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant,
+ ocd->ocd_grant, cli->cl_dirty);
+ /* workaround for servers which do not have the patch from
+ * LU-2679 */
+ cli->cl_avail_grant = ocd->ocd_grant;
+ }
+
+ /* determine the appropriate chunk size used by osc_extent. */
+ cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n",
+ cli->cl_import->imp_obd->obd_name,
+ cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits);
+
+ if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
+ list_empty(&cli->cl_grant_shrink_list))
+ osc_add_shrink_grant(cli);
+}
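A tiny sketch of the grant accounting described in the comment above, including the LU-2679 fallback; the byte counts are invented:

#include <stdbool.h>
#include <stdio.h>

static long init_avail_grant(long ocd_grant, long cl_dirty, bool evicted)
{
	long avail = evicted ? ocd_grant : ocd_grant - cl_dirty;

	/* mirror the LU-2679 workaround: never start with a negative grant */
	return avail < 0 ? ocd_grant : avail;
}

int main(void)
{
	printf("normal:       %ld\n", init_avail_grant(8 << 20, 2 << 20, false));
	printf("evicted:      %ld\n", init_avail_grant(8 << 20, 2 << 20, true));
	printf("buggy server: %ld\n", init_avail_grant(1 << 20, 2 << 20, false));
	return 0;
}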
+
+/* We assume that the reason this OSC got a short read is that it read
+ * beyond the end of a stripe file; i.e. lustre is reading a sparse file
+ * via the LOV, and it _knows_ it's reading inside the file, it's just that
+ * this stripe never got written at or beyond this stripe offset yet. */
+static void handle_short_read(int nob_read, u32 page_count,
+ struct brw_page **pga)
+{
+ char *ptr;
+ int i = 0;
+
+ /* skip bytes read OK */
+ while (nob_read > 0) {
+ LASSERT (page_count > 0);
+
+ if (pga[i]->count > nob_read) {
+ /* EOF inside this page */
+ ptr = kmap(pga[i]->pg) +
+ (pga[i]->off & ~CFS_PAGE_MASK);
+ memset(ptr + nob_read, 0, pga[i]->count - nob_read);
+ kunmap(pga[i]->pg);
+ page_count--;
+ i++;
+ break;
+ }
+
+ nob_read -= pga[i]->count;
+ page_count--;
+ i++;
+ }
+
+ /* zero remaining pages */
+ while (page_count-- > 0) {
+ ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK);
+ memset(ptr, 0, pga[i]->count);
+ kunmap(pga[i]->pg);
+ i++;
+ }
+}
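A toy version of the zero-fill above, operating on one flat buffer instead of a page array, just to show that bytes past the short-read length end up zeroed for the sparse part of the stripe:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[16];
	int requested = (int)sizeof(buf);
	int nob_read = 6;			/* server returned fewer bytes */

	memset(buf, 'X', sizeof(buf));		/* pretend stale page contents */
	memcpy(buf, "stripe", nob_read);	/* the bytes that really arrived */
	memset(buf + nob_read, 0, requested - nob_read);

	printf("got %d/%d bytes, tail zero-filled: %.6s\n",
	       nob_read, requested, buf);
	return 0;
}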
+
+static int check_write_rcs(struct ptlrpc_request *req,
+ int requested_nob, int niocount,
+ u32 page_count, struct brw_page **pga)
+{
+ int i;
+ __u32 *remote_rcs;
+
+ remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
+ sizeof(*remote_rcs) *
+ niocount);
+ if (remote_rcs == NULL) {
+ CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
+ return -EPROTO;
+ }
+
+ /* return error if any niobuf was in error */
+ for (i = 0; i < niocount; i++) {
+ if ((int)remote_rcs[i] < 0)
+ return remote_rcs[i];
+
+ if (remote_rcs[i] != 0) {
+ CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
+ i, remote_rcs[i], req);
+ return -EPROTO;
+ }
+ }
+
+ if (req->rq_bulk->bd_nob_transferred != requested_nob) {
+ CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+ req->rq_bulk->bd_nob_transferred, requested_nob);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
+{
+ if (p1->flag != p2->flag) {
+ unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
+ OBD_BRW_SYNC | OBD_BRW_ASYNC|OBD_BRW_NOQUOTA);
+
+ /* warn if we try to combine flags that we don't know to be
+ * safe to combine */
+ if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
+ CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at http://bugs.whamcloud.com/\n",
+ p1->flag, p2->flag);
+ }
+ return 0;
+ }
+
+ return (p1->off + p1->count == p2->off);
+}
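A standalone illustration of this merge rule, using a stripped-down stand-in for struct brw_page that only carries the fields the check looks at (the real helper also masks out a few flag bits before comparing):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_page {
	uint64_t off;	/* byte offset within the object */
	uint32_t count;	/* bytes covered by this page */
	uint32_t flag;	/* BRW flags; must match for a merge */
};

static bool toy_can_merge(const struct toy_page *p1, const struct toy_page *p2)
{
	return p1->flag == p2->flag && p1->off + p1->count == p2->off;
}

int main(void)
{
	struct toy_page a = { .off = 0,     .count = 4096, .flag = 0 };
	struct toy_page b = { .off = 4096,  .count = 4096, .flag = 0 };
	struct toy_page c = { .off = 12288, .count = 4096, .flag = 0 };

	printf("a+b merge: %d\n", toy_can_merge(&a, &b));	/* 1: contiguous */
	printf("b+c merge: %d\n", toy_can_merge(&b, &c));	/* 0: 4 KiB gap */
	return 0;
}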
+
+static u32 osc_checksum_bulk(int nob, u32 pg_count,
+ struct brw_page **pga, int opc,
+ cksum_type_t cksum_type)
+{
+ __u32 cksum;
+ int i = 0;
+ struct cfs_crypto_hash_desc *hdesc;
+ unsigned int bufsize;
+ int err;
+ unsigned char cfs_alg = cksum_obd2cfs(cksum_type);
+
+ LASSERT(pg_count > 0);
+
+ hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
+ if (IS_ERR(hdesc)) {
+ CERROR("Unable to initialize checksum hash %s\n",
+ cfs_crypto_hash_name(cfs_alg));
+ return PTR_ERR(hdesc);
+ }
+
+ while (nob > 0 && pg_count > 0) {
+ int count = pga[i]->count > nob ? nob : pga[i]->count;
+
+ /* corrupt the data before we compute the checksum, to
+ * simulate an OST->client data error */
+ if (i == 0 && opc == OST_READ &&
+ OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
+ unsigned char *ptr = kmap(pga[i]->pg);
+ int off = pga[i]->off & ~CFS_PAGE_MASK;
+ memcpy(ptr + off, "bad1", min(4, nob));
+ kunmap(pga[i]->pg);
+ }
+ cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
+ pga[i]->off & ~CFS_PAGE_MASK,
+ count);
+ CDEBUG(D_PAGE,
+ "page %p map %p index %lu flags %lx count %u priv %0lx: off %d\n",
+ pga[i]->pg, pga[i]->pg->mapping, pga[i]->pg->index,
+ (long)pga[i]->pg->flags, page_count(pga[i]->pg),
+ page_private(pga[i]->pg),
+ (int)(pga[i]->off & ~CFS_PAGE_MASK));
+
+ nob -= pga[i]->count;
+ pg_count--;
+ i++;
+ }
+
+ bufsize = 4;
+ err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
+
+ if (err)
+ cfs_crypto_hash_final(hdesc, NULL, NULL);
+
+ /* For a send we only compute a wrong checksum instead of
+ * corrupting the data, so it is still correct on a redo */
+ if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
+ cksum++;
+
+ return cksum;
+}
+
+static int osc_brw_prep_request(int cmd, struct client_obd *cli,
+ struct obdo *oa,
+ struct lov_stripe_md *lsm, u32 page_count,
+ struct brw_page **pga,
+ struct ptlrpc_request **reqp,
+ struct obd_capa *ocapa, int reserve,
+ int resend)
+{
+ struct ptlrpc_request *req;
+ struct ptlrpc_bulk_desc *desc;
+ struct ost_body *body;
+ struct obd_ioobj *ioobj;
+ struct niobuf_remote *niobuf;
+ int niocount, i, requested_nob, opc, rc;
+ struct osc_brw_async_args *aa;
+ struct req_capsule *pill;
+ struct brw_page *pg_prev;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
+ return -ENOMEM; /* Recoverable */
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
+ return -EINVAL; /* Fatal */
+
+ if ((cmd & OBD_BRW_WRITE) != 0) {
+ opc = OST_WRITE;
+ req = ptlrpc_request_alloc_pool(cli->cl_import,
+ cli->cl_import->imp_rq_pool,
+ &RQF_OST_BRW_WRITE);
+ } else {
+ opc = OST_READ;
+ req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
+ }
+ if (req == NULL)
+ return -ENOMEM;
+
+ for (niocount = i = 1; i < page_count; i++) {
+ if (!can_merge_pages(pga[i - 1], pga[i]))
+ niocount++;
+ }
+
+ pill = &req->rq_pill;
+ req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
+ sizeof(*ioobj));
+ req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
+ niocount * sizeof(*niobuf));
+ osc_set_capa_size(req, &RMF_CAPA1, ocapa);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+ req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+ ptlrpc_at_set_req_timeout(req);
+ /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+ * retry logic */
+ req->rq_no_retry_einprogress = 1;
+
+ desc = ptlrpc_prep_bulk_imp(req, page_count,
+ cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
+ opc == OST_WRITE ? BULK_GET_SOURCE : BULK_PUT_SINK,
+ OST_BULK_PORTAL);
+
+ if (desc == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ /* NB request now owns desc and will free it when it gets freed */
+
+ body = req_capsule_client_get(pill, &RMF_OST_BODY);
+ ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+ niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+ LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+
+ lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+ obdo_to_ioobj(oa, ioobj);
+ ioobj->ioo_bufcnt = niocount;
+ /* The high bits of ioo_max_brw tell the server the _maximum_ number of
+ * bulks that might be sent for this request. The actual number is decided
+ * when the RPC is finally sent in ptlrpc_register_bulk(). It sends
+ * "max - 1" for compatibility with old clients sending "0", and also so
+ * that the actual maximum is a power-of-two number, not one less. LU-1431 */
+ ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+ osc_pack_capa(req, body, ocapa);
+ LASSERT(page_count > 0);
+ pg_prev = pga[0];
+ for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
+ struct brw_page *pg = pga[i];
+ int poff = pg->off & ~CFS_PAGE_MASK;
+
+ LASSERT(pg->count > 0);
+ /* make sure there is no gap in the middle of page array */
+ LASSERTF(page_count == 1 ||
+ (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) &&
+ ergo(i > 0 && i < page_count - 1,
+ poff == 0 && pg->count == PAGE_CACHE_SIZE) &&
+ ergo(i == page_count - 1, poff == 0)),
+ "i: %d/%d pg: %p off: %llu, count: %u\n",
+ i, page_count, pg, pg->off, pg->count);
+ LASSERTF(i == 0 || pg->off > pg_prev->off,
+ "i %d p_c %u pg %p [pri %lu ind %lu] off %llu prev_pg %p [pri %lu ind %lu] off %llu\n",
+ i, page_count,
+ pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+ pg_prev->pg, page_private(pg_prev->pg),
+ pg_prev->pg->index, pg_prev->off);
+ LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
+ (pg->flag & OBD_BRW_SRVLOCK));
+
+ ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count);
+ requested_nob += pg->count;
+
+ if (i > 0 && can_merge_pages(pg_prev, pg)) {
+ niobuf--;
+ niobuf->len += pg->count;
+ } else {
+ niobuf->offset = pg->off;
+ niobuf->len = pg->count;
+ niobuf->flags = pg->flag;
+ }
+ pg_prev = pg;
+ }
+
+ LASSERTF((void *)(niobuf - niocount) ==
+ req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
+ "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
+ &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
+
+ osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
+ if (resend) {
+ if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+ body->oa.o_valid |= OBD_MD_FLFLAGS;
+ body->oa.o_flags = 0;
+ }
+ body->oa.o_flags |= OBD_FL_RECOV_RESEND;
+ }
+
+ if (osc_should_shrink_grant(cli))
+ osc_shrink_grant_local(cli, &body->oa);
+
+ /* size[REQ_REC_OFF] still sizeof (*body) */
+ if (opc == OST_WRITE) {
+ if (cli->cl_checksum &&
+ !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+ /* store cl_cksum_type in a local variable since
+ * it can be changed via lprocfs */
+ cksum_type_t cksum_type = cli->cl_cksum_type;
+
+ if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+ oa->o_flags &= OBD_FL_LOCAL_MASK;
+ body->oa.o_flags = 0;
+ }
+ body->oa.o_flags |= cksum_type_pack(cksum_type);
+ body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+ body->oa.o_cksum = osc_checksum_bulk(requested_nob,
+ page_count, pga,
+ OST_WRITE,
+ cksum_type);
+ CDEBUG(D_PAGE, "checksum at write origin: %x\n",
+ body->oa.o_cksum);
+ /* save this in 'oa', too, for later checking */
+ oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+ oa->o_flags |= cksum_type_pack(cksum_type);
+ } else {
+ /* clear out the checksum flag, in case this is a
+ * resend but cl_checksum is no longer set. b=11238 */
+ oa->o_valid &= ~OBD_MD_FLCKSUM;
+ }
+ oa->o_cksum = body->oa.o_cksum;
+ /* 1 RC per niobuf */
+ req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
+ sizeof(__u32) * niocount);
+ } else {
+ if (cli->cl_checksum &&
+ !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+ if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
+ body->oa.o_flags = 0;
+ body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
+ body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+ }
+ }
+ ptlrpc_request_set_replen(req);
+
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->aa_oa = oa;
+ aa->aa_requested_nob = requested_nob;
+ aa->aa_nio_count = niocount;
+ aa->aa_page_count = page_count;
+ aa->aa_resends = 0;
+ aa->aa_ppga = pga;
+ aa->aa_cli = cli;
+ INIT_LIST_HEAD(&aa->aa_oaps);
+ if (ocapa && reserve)
+ aa->aa_ocapa = capa_get(ocapa);
+
+ *reqp = req;
+ return 0;
+
+ out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
+ __u32 client_cksum, __u32 server_cksum, int nob,
+ u32 page_count, struct brw_page **pga,
+ cksum_type_t client_cksum_type)
+{
+ __u32 new_cksum;
+ char *msg;
+ cksum_type_t cksum_type;
+
+ if (server_cksum == client_cksum) {
+ CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+ return 0;
+ }
+
+ cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
+ oa->o_flags : 0);
+ new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
+ cksum_type);
+
+ if (cksum_type != client_cksum_type)
+ msg = "the server did not use the checksum type specified in the original request - likely a protocol problem"
+ ;
+ else if (new_cksum == server_cksum)
+ msg = "changed on the client after we checksummed it - likely false positive due to mmap IO (bug 11742)"
+ ;
+ else if (new_cksum == client_cksum)
+ msg = "changed in transit before arrival at OST";
+ else
+ msg = "changed in transit AND doesn't match the original - likely false positive due to mmap IO (bug 11742)"
+ ;
+
+ LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID
+ " object "DOSTID" extent [%llu-%llu]\n",
+ msg, libcfs_nid2str(peer->nid),
+ oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
+ oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
+ oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
+ POSTID(&oa->o_oi), pga[0]->off,
+ pga[page_count-1]->off + pga[page_count-1]->count - 1);
+ CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n",
+ client_cksum, client_cksum_type,
+ server_cksum, cksum_type, new_cksum);
+ return 1;
+}
+
+/* Note rc enters this function as number of bytes transferred */
+static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
+{
+ struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+ const lnet_process_id_t *peer =
+ &req->rq_import->imp_connection->c_peer;
+ struct client_obd *cli = aa->aa_cli;
+ struct ost_body *body;
+ __u32 client_cksum = 0;
+
+ if (rc < 0 && rc != -EDQUOT) {
+ DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
+ return rc;
+ }
+
+ LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
+ body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+ if (body == NULL) {
+ DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
+ return -EPROTO;
+ }
+
+ /* set/clear over quota flag for a uid/gid */
+ if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
+ body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
+ unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
+
+ CDEBUG(D_QUOTA, "setdq for [%u %u] with valid %#llx, flags %x\n",
+ body->oa.o_uid, body->oa.o_gid, body->oa.o_valid,
+ body->oa.o_flags);
+ osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags);
+ }
+
+ osc_update_grant(cli, body);
+
+ if (rc < 0)
+ return rc;
+
+ if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
+ client_cksum = aa->aa_oa->o_cksum; /* save for later */
+
+ if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
+ if (rc > 0) {
+ CERROR("Unexpected +ve rc %d\n", rc);
+ return -EPROTO;
+ }
+ LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
+
+ if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+ return -EAGAIN;
+
+ if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
+ check_write_checksum(&body->oa, peer, client_cksum,
+ body->oa.o_cksum, aa->aa_requested_nob,
+ aa->aa_page_count, aa->aa_ppga,
+ cksum_type_unpack(aa->aa_oa->o_flags)))
+ return -EAGAIN;
+
+ rc = check_write_rcs(req, aa->aa_requested_nob,
+ aa->aa_nio_count,
+ aa->aa_page_count, aa->aa_ppga);
+ goto out;
+ }
+
+ /* The rest of this function executes only for OST_READs */
+
+ /* if unwrap_bulk failed, return -EAGAIN to retry */
+ rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+ if (rc < 0) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ if (rc > aa->aa_requested_nob) {
+ CERROR("Unexpected rc %d (%d requested)\n", rc,
+ aa->aa_requested_nob);
+ return -EPROTO;
+ }
+
+ if (rc != req->rq_bulk->bd_nob_transferred) {
+ CERROR ("Unexpected rc %d (%d transferred)\n",
+ rc, req->rq_bulk->bd_nob_transferred);
+ return -EPROTO;
+ }
+
+ if (rc < aa->aa_requested_nob)
+ handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
+
+ if (body->oa.o_valid & OBD_MD_FLCKSUM) {
+ static int cksum_counter;
+ __u32 server_cksum = body->oa.o_cksum;
+ char *via;
+ char *router;
+ cksum_type_t cksum_type;
+
+ cksum_type = cksum_type_unpack(body->oa.o_valid & OBD_MD_FLFLAGS ?
+ body->oa.o_flags : 0);
+ client_cksum = osc_checksum_bulk(rc, aa->aa_page_count,
+ aa->aa_ppga, OST_READ,
+ cksum_type);
+
+ if (peer->nid == req->rq_bulk->bd_sender) {
+ via = router = "";
+ } else {
+ via = " via ";
+ router = libcfs_nid2str(req->rq_bulk->bd_sender);
+ }
+
+ if (server_cksum != client_cksum) {
+ LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from %s%s%s inode " DFID " object " DOSTID " extent [%llu-%llu]\n",
+ req->rq_import->imp_obd->obd_name,
+ libcfs_nid2str(peer->nid),
+ via, router,
+ body->oa.o_valid & OBD_MD_FLFID ?
+ body->oa.o_parent_seq : (__u64)0,
+ body->oa.o_valid & OBD_MD_FLFID ?
+ body->oa.o_parent_oid : 0,
+ body->oa.o_valid & OBD_MD_FLFID ?
+ body->oa.o_parent_ver : 0,
+ POSTID(&body->oa.o_oi),
+ aa->aa_ppga[0]->off,
+ aa->aa_ppga[aa->aa_page_count-1]->off +
+ aa->aa_ppga[aa->aa_page_count-1]->count -
+ 1);
+ CERROR("client %x, server %x, cksum_type %x\n",
+ client_cksum, server_cksum, cksum_type);
+ cksum_counter = 0;
+ aa->aa_oa->o_cksum = client_cksum;
+ rc = -EAGAIN;
+ } else {
+ cksum_counter++;
+ CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+ rc = 0;
+ }
+ } else if (unlikely(client_cksum)) {
+ static int cksum_missed;
+
+ cksum_missed++;
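+ /* (x & -x) == x only when x is a power of two, so this message is
+ * emitted with exponential backoff (1st, 2nd, 4th, 8th miss, ...) */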
+ if ((cksum_missed & (-cksum_missed)) == cksum_missed)
+ CERROR("Checksum %u requested from %s but not sent\n",
+ cksum_missed, libcfs_nid2str(peer->nid));
+ } else {
+ rc = 0;
+ }
+out:
+ if (rc >= 0)
+ lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+ aa->aa_oa, &body->oa);
+
+ return rc;
+}
+
+static int osc_brw_redo_request(struct ptlrpc_request *request,
+ struct osc_brw_async_args *aa, int rc)
+{
+ struct ptlrpc_request *new_req;
+ struct osc_brw_async_args *new_aa;
+ struct osc_async_page *oap;
+
+ DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
+ "redo for recoverable error %d", rc);
+
+ rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
+ OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
+ aa->aa_cli, aa->aa_oa,
+ NULL /* lsm unused by osc currently */,
+ aa->aa_page_count, aa->aa_ppga,
+ &new_req, aa->aa_ocapa, 0, 1);
+ if (rc)
+ return rc;
+
+ list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+ if (oap->oap_request != NULL) {
+ LASSERTF(request == oap->oap_request,
+ "request %p != oap_request %p\n",
+ request, oap->oap_request);
+ if (oap->oap_interrupted) {
+ ptlrpc_req_finished(new_req);
+ return -EINTR;
+ }
+ }
+ }
+ /* New request takes over pga and oaps from old request.
+ * Note that copying a list_head doesn't work, need to move it... */
+ aa->aa_resends++;
+ new_req->rq_interpret_reply = request->rq_interpret_reply;
+ new_req->rq_async_args = request->rq_async_args;
+ /* cap the resend delay to the current request timeout; this is similar
+ * to what ptlrpc does (see after_reply()) */
+ if (aa->aa_resends > new_req->rq_timeout)
+ new_req->rq_sent = get_seconds() + new_req->rq_timeout;
+ else
+ new_req->rq_sent = get_seconds() + aa->aa_resends;
+ new_req->rq_generation_set = 1;
+ new_req->rq_import_generation = request->rq_import_generation;
+
+ new_aa = ptlrpc_req_async_args(new_req);
+
+ INIT_LIST_HEAD(&new_aa->aa_oaps);
+ list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
+ INIT_LIST_HEAD(&new_aa->aa_exts);
+ list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
+ new_aa->aa_resends = aa->aa_resends;
+
+ list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
+ if (oap->oap_request) {
+ ptlrpc_req_finished(oap->oap_request);
+ oap->oap_request = ptlrpc_request_addref(new_req);
+ }
+ }
+
+ new_aa->aa_ocapa = aa->aa_ocapa;
+ aa->aa_ocapa = NULL;
+
+ /* XXX: This code will run into problems if we're ever going to support
+ * adding a series of BRW RPCs to a self-defined ptlrpc_request_set
+ * and waiting for all of them to finish. We should inherit the request
+ * set from the old request. */
+ ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1);
+
+ DEBUG_REQ(D_INFO, new_req, "new request");
+ return 0;
+}
+
+/*
+ * ugh, we want disk allocation on the target to happen in offset order. we'll
+ * follow Sedgewick's advice and stick to the dead simple shellsort -- it'll do
+ * fine for our small page arrays and doesn't require allocation. it's an
+ * insertion sort that swaps elements that are strides apart, shrinking the
+ * stride down until it's 1 and the array is sorted.
+ */
+static void sort_brw_pages(struct brw_page **array, int num)
+{
+ int stride, i, j;
+ struct brw_page *tmp;
+
+ if (num == 1)
+ return;
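+ /* build the 3h+1 gap sequence (1, 4, 13, 40, ...) until it reaches num;
+ * the do/while below walks back down the sequence via stride /= 3 */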
+ for (stride = 1; stride < num ; stride = (stride * 3) + 1)
+ ;
+
+ do {
+ stride /= 3;
+ for (i = stride ; i < num ; i++) {
+ tmp = array[i];
+ j = i;
+ while (j >= stride && array[j - stride]->off > tmp->off) {
+ array[j] = array[j - stride];
+ j -= stride;
+ }
+ array[j] = tmp;
+ }
+ } while (stride > 1);
+}
+
+static void osc_release_ppga(struct brw_page **ppga, u32 count)
+{
+ LASSERT(ppga != NULL);
+ OBD_FREE(ppga, sizeof(*ppga) * count);
+}
+
+static int brw_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req, void *data, int rc)
+{
+ struct osc_brw_async_args *aa = data;
+ struct osc_extent *ext;
+ struct osc_extent *tmp;
+ struct cl_object *obj = NULL;
+ struct client_obd *cli = aa->aa_cli;
+
+ rc = osc_brw_fini_request(req, rc);
+ CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
+ /* When the server returns -EINPROGRESS, the client should always retry
+ * regardless of how many times the bulk has already been resent. */
+ if (osc_recoverable_error(rc)) {
+ if (req->rq_import_generation !=
+ req->rq_import->imp_generation) {
+ CDEBUG(D_HA, "%s: resend cross eviction for object: " DOSTID ", rc = %d.\n",
+ req->rq_import->imp_obd->obd_name,
+ POSTID(&aa->aa_oa->o_oi), rc);
+ } else if (rc == -EINPROGRESS ||
+ client_should_resend(aa->aa_resends, aa->aa_cli)) {
+ rc = osc_brw_redo_request(req, aa, rc);
+ } else {
+ CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n",
+ req->rq_import->imp_obd->obd_name,
+ POSTID(&aa->aa_oa->o_oi), rc);
+ }
+
+ if (rc == 0)
+ return 0;
+ else if (rc == -EAGAIN || rc == -EINPROGRESS)
+ rc = -EIO;
+ }
+
+ if (aa->aa_ocapa) {
+ capa_put(aa->aa_ocapa);
+ aa->aa_ocapa = NULL;
+ }
+
+ list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
+ if (obj == NULL && rc == 0) {
+ obj = osc2cl(ext->oe_obj);
+ cl_object_get(obj);
+ }
+
+ list_del_init(&ext->oe_link);
+ osc_extent_finish(env, ext, 1, rc);
+ }
+ LASSERT(list_empty(&aa->aa_exts));
+ LASSERT(list_empty(&aa->aa_oaps));
+
+ if (obj != NULL) {
+ struct obdo *oa = aa->aa_oa;
+ struct cl_attr *attr = &osc_env_info(env)->oti_attr;
+ unsigned long valid = 0;
+
+ LASSERT(rc == 0);
+ if (oa->o_valid & OBD_MD_FLBLOCKS) {
+ attr->cat_blocks = oa->o_blocks;
+ valid |= CAT_BLOCKS;
+ }
+ if (oa->o_valid & OBD_MD_FLMTIME) {
+ attr->cat_mtime = oa->o_mtime;
+ valid |= CAT_MTIME;
+ }
+ if (oa->o_valid & OBD_MD_FLATIME) {
+ attr->cat_atime = oa->o_atime;
+ valid |= CAT_ATIME;
+ }
+ if (oa->o_valid & OBD_MD_FLCTIME) {
+ attr->cat_ctime = oa->o_ctime;
+ valid |= CAT_CTIME;
+ }
+ if (valid != 0) {
+ cl_object_attr_lock(obj);
+ cl_object_attr_set(env, obj, attr, valid);
+ cl_object_attr_unlock(obj);
+ }
+ cl_object_put(env, obj);
+ }
+ OBDO_FREE(aa->aa_oa);
+
+ cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc :
+ req->rq_bulk->bd_nob_transferred);
+ osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
+ ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
+ * is called so we know whether to go to sync BRWs or wait for more
+ * RPCs to complete */
+ if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
+ cli->cl_w_in_flight--;
+ else
+ cli->cl_r_in_flight--;
+ osc_wake_cache_waiters(cli);
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+ return rc;
+}
+
+/**
+ * Build an RPC from the list of extents @ext_list. The caller must ensure
+ * that the total number of pages in this list does NOT exceed the maximum
+ * pages per RPC.
+ * Extents in the list must be in OES_RPC state.
+ */
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+ struct list_head *ext_list, int cmd, pdl_policy_t pol)
+{
+ struct ptlrpc_request *req = NULL;
+ struct osc_extent *ext;
+ struct brw_page **pga = NULL;
+ struct osc_brw_async_args *aa = NULL;
+ struct obdo *oa = NULL;
+ struct osc_async_page *oap;
+ struct osc_async_page *tmp;
+ struct cl_req *clerq = NULL;
+ enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE :
+ CRT_READ;
+ struct ldlm_lock *lock = NULL;
+ struct cl_req_attr *crattr = NULL;
+ u64 starting_offset = OBD_OBJECT_EOF;
+ u64 ending_offset = 0;
+ int mpflag = 0;
+ int mem_tight = 0;
+ int page_count = 0;
+ int i;
+ int rc;
+ struct ost_body *body;
+ LIST_HEAD(rpc_list);
+
+ LASSERT(!list_empty(ext_list));
+
+ /* add pages into rpc_list to build BRW rpc */
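+ /* while doing so, track the byte range [starting_offset, ending_offset)
+ * covered by the RPC and assert that only the first and last pages of
+ * the range may be partial */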
+ list_for_each_entry(ext, ext_list, oe_link) {
+ LASSERT(ext->oe_state == OES_RPC);
+ mem_tight |= ext->oe_memalloc;
+ list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+ ++page_count;
+ list_add_tail(&oap->oap_rpc_item, &rpc_list);
+ if (starting_offset > oap->oap_obj_off)
+ starting_offset = oap->oap_obj_off;
+ else
+ LASSERT(oap->oap_page_off == 0);
+ if (ending_offset < oap->oap_obj_off + oap->oap_count)
+ ending_offset = oap->oap_obj_off +
+ oap->oap_count;
+ else
+ LASSERT(oap->oap_page_off + oap->oap_count ==
+ PAGE_CACHE_SIZE);
+ }
+ }
+
+ if (mem_tight)
+ mpflag = cfs_memory_pressure_get_and_set();
+
+ OBD_ALLOC(crattr, sizeof(*crattr));
+ if (crattr == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ OBD_ALLOC(pga, sizeof(*pga) * page_count);
+ if (pga == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ OBDO_ALLOC(oa);
+ if (oa == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ i = 0;
+ list_for_each_entry(oap, &rpc_list, oap_rpc_item) {
+ struct cl_page *page = oap2cl_page(oap);
+ if (clerq == NULL) {
+ clerq = cl_req_alloc(env, page, crt,
+ 1 /* only 1-object rpcs for now */);
+ if (IS_ERR(clerq)) {
+ rc = PTR_ERR(clerq);
+ goto out;
+ }
+ lock = oap->oap_ldlm_lock;
+ }
+ if (mem_tight)
+ oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
+ pga[i] = &oap->oap_brw_page;
+ pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
+ CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
+ pga[i]->pg, page_index(oap->oap_page), oap,
+ pga[i]->flag);
+ i++;
+ cl_req_page_add(env, clerq, page);
+ }
+
+ /* always get the data for the obdo for the rpc */
+ LASSERT(clerq != NULL);
+ crattr->cra_oa = oa;
+ cl_req_attr_set(env, clerq, crattr, ~0ULL);
+ if (lock) {
+ oa->o_handle = lock->l_remote_handle;
+ oa->o_valid |= OBD_MD_FLHANDLE;
+ }
+
+ rc = cl_req_prep(env, clerq);
+ if (rc != 0) {
+ CERROR("cl_req_prep failed: %d\n", rc);
+ goto out;
+ }
+
+ sort_brw_pages(pga, page_count);
+ rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
+ pga, &req, crattr->cra_capa, 1, 0);
+ if (rc != 0) {
+ CERROR("prep_req failed: %d\n", rc);
+ goto out;
+ }
+
+ req->rq_interpret_reply = brw_interpret;
+
+ if (mem_tight != 0)
+ req->rq_memalloc = 1;
+
+ /* Need to update the timestamps after the request is built in case
+ * we race with setattr (locally or in queue at the OST). If the OST
+ * gets the later setattr before the earlier BRW (as determined by the
+ * request xid), the OST will not use the BRW timestamps. Sadly, there
+ * is no obvious way to do this in a single call. bug 10150 */
+ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+ crattr->cra_oa = &body->oa;
+ cl_req_attr_set(env, clerq, crattr,
+ OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
+
+ lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
+
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ INIT_LIST_HEAD(&aa->aa_oaps);
+ list_splice_init(&rpc_list, &aa->aa_oaps);
+ INIT_LIST_HEAD(&aa->aa_exts);
+ list_splice_init(ext_list, &aa->aa_exts);
+ aa->aa_clerq = clerq;
+
+ /* queued sync pages can be torn down while the pages
+ * are between the pending list and the rpc */
+ tmp = NULL;
+ list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+ /* only one oap gets a request reference */
+ if (tmp == NULL)
+ tmp = oap;
+ if (oap->oap_interrupted && !req->rq_intr) {
+ CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
+ oap, req);
+ ptlrpc_mark_interrupted(req);
+ }
+ }
+ if (tmp != NULL)
+ tmp->oap_request = ptlrpc_request_addref(req);
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ starting_offset >>= PAGE_CACHE_SHIFT;
+ if (cmd == OBD_BRW_READ) {
+ cli->cl_r_in_flight++;
+ lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+ lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+ lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+ starting_offset + 1);
+ } else {
+ cli->cl_w_in_flight++;
+ lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+ lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+ lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+ starting_offset + 1);
+ }
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
+ page_count, aa, cli->cl_r_in_flight,
+ cli->cl_w_in_flight);
+
+ /* XXX: Maybe the caller can check the RPC bulk descriptor to
+ * see which CPU/NUMA node the majority of pages were allocated
+ * on, and try to assign the async RPC to the CPU core
+ * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic.
+ *
+ * But on the other hand, we expect that multiple ptlrpcd
+ * threads and the initial write sponsor can run in parallel,
+ * especially when data checksumming is enabled, which is a
+ * CPU-bound operation that a single ptlrpcd thread cannot keep
+ * up with. So having more ptlrpcd threads share the BRW load
+ * (with PDL_POLICY_ROUND) seems better.
+ */
+ ptlrpcd_add_req(req, pol, -1);
+ rc = 0;
+
+out:
+ if (mem_tight != 0)
+ cfs_memory_pressure_restore(mpflag);
+
+ if (crattr != NULL) {
+ capa_put(crattr->cra_capa);
+ OBD_FREE(crattr, sizeof(*crattr));
+ }
+
+ if (rc != 0) {
+ LASSERT(req == NULL);
+
+ if (oa)
+ OBDO_FREE(oa);
+ if (pga)
+ OBD_FREE(pga, sizeof(*pga) * page_count);
+ /* this should happen rarely and is pretty bad; it makes the
+ * pending list not follow the dirty order */
+ while (!list_empty(ext_list)) {
+ ext = list_entry(ext_list->next, struct osc_extent,
+ oe_link);
+ list_del_init(&ext->oe_link);
+ osc_extent_finish(env, ext, 0, rc);
+ }
+ if (clerq && !IS_ERR(clerq))
+ cl_req_completion(env, clerq, rc);
+ }
+ return rc;
+}
+
+static int osc_set_lock_data_with_check(struct ldlm_lock *lock,
+ struct ldlm_enqueue_info *einfo)
+{
+ void *data = einfo->ei_cbdata;
+ int set = 0;
+
+ LASSERT(lock != NULL);
+ LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl);
+ LASSERT(lock->l_resource->lr_type == einfo->ei_type);
+ LASSERT(lock->l_completion_ast == einfo->ei_cb_cp);
+ LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl);
+
+ lock_res_and_lock(lock);
+ spin_lock(&osc_ast_guard);
+
+ if (lock->l_ast_data == NULL)
+ lock->l_ast_data = data;
+ if (lock->l_ast_data == data)
+ set = 1;
+
+ spin_unlock(&osc_ast_guard);
+ unlock_res_and_lock(lock);
+
+ return set;
+}
+
+static int osc_set_data_with_check(struct lustre_handle *lockh,
+ struct ldlm_enqueue_info *einfo)
+{
+ struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+ int set = 0;
+
+ if (lock != NULL) {
+ set = osc_set_lock_data_with_check(lock, einfo);
+ LDLM_LOCK_PUT(lock);
+ } else
+ CERROR("lockh %p, data %p - client evicted?\n",
+ lockh, einfo->ei_cbdata);
+ return set;
+}
+
+/* find any ldlm lock of the inode in osc
+ * return 0 if none is found
+ * 1 if one is found
+ * < 0 on error */
+static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+ ldlm_iterator_t replace, void *data)
+{
+ struct ldlm_res_id res_id;
+ struct obd_device *obd = class_exp2obd(exp);
+ int rc = 0;
+
+ ostid_build_res_name(&lsm->lsm_oi, &res_id);
+ rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data);
+ if (rc == LDLM_ITER_STOP)
+ return 1;
+ if (rc == LDLM_ITER_CONTINUE)
+ return 0;
+ return rc;
+}
+
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb,
+ obd_enqueue_update_f upcall, void *cookie,
+ __u64 *flags, int agl, int rc)
+{
+ int intent = *flags & LDLM_FL_HAS_INTENT;
+
+ if (intent) {
+ /* The request was created before ldlm_cli_enqueue call. */
+ if (rc == ELDLM_LOCK_ABORTED) {
+ struct ldlm_reply *rep;
+ rep = req_capsule_server_get(&req->rq_pill,
+ &RMF_DLM_REP);
+
+ LASSERT(rep != NULL);
+ rep->lock_policy_res1 =
+ ptlrpc_status_ntoh(rep->lock_policy_res1);
+ if (rep->lock_policy_res1)
+ rc = rep->lock_policy_res1;
+ }
+ }
+
+ if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) ||
+ (rc == 0)) {
+ *flags |= LDLM_FL_LVB_READY;
+ CDEBUG(D_INODE, "got kms %llu blocks %llu mtime %llu\n",
+ lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime);
+ }
+
+ /* Call the update callback. */
+ rc = (*upcall)(cookie, rc);
+ return rc;
+}
+
+static int osc_enqueue_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ struct osc_enqueue_args *aa, int rc)
+{
+ struct ldlm_lock *lock;
+ struct lustre_handle handle;
+ __u32 mode;
+ struct ost_lvb *lvb;
+ __u32 lvb_len;
+ __u64 *flags = aa->oa_flags;
+
+ /* Make a local copy of a lock handle and a mode, because aa->oa_*
+ * might be freed anytime after lock upcall has been called. */
+ lustre_handle_copy(&handle, aa->oa_lockh);
+ mode = aa->oa_ei->ei_mode;
+
+ /* ldlm_cli_enqueue is holding a reference on the lock, so it must
+ * be valid. */
+ lock = ldlm_handle2lock(&handle);
+
+ /* Take an additional reference so that a blocking AST that
+ * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
+ * to arrive after an upcall has been executed by
+ * osc_enqueue_fini(). */
+ ldlm_lock_addref(&handle, mode);
+
+ /* Let the CP AST grant the lock first. */
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+ if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) {
+ lvb = NULL;
+ lvb_len = 0;
+ } else {
+ lvb = aa->oa_lvb;
+ lvb_len = sizeof(*aa->oa_lvb);
+ }
+
+ /* Complete the lock-acquisition procedure. */
+ rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
+ mode, flags, lvb, lvb_len, &handle, rc);
+ /* Complete osc stuff. */
+ rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie,
+ flags, aa->oa_agl, rc);
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+ /* Release the lock for async request. */
+ if (lustre_handle_is_used(&handle) && rc == ELDLM_OK)
+ /*
+ * Releases a reference taken by ldlm_cli_enqueue(), if it is
+ * not already released by
+ * ldlm_cli_enqueue_fini()->failed_lock_cleanup()
+ */
+ ldlm_lock_decref(&handle, mode);
+
+ LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
+ aa->oa_lockh, req, aa);
+ ldlm_lock_decref(&handle, mode);
+ LDLM_LOCK_PUT(lock);
+ return rc;
+}
+
+struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
+
+/* When enqueuing asynchronously, locks are not ordered, so we can obtain a lock
+ * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
+ * other synchronous requests; however, keeping some locks while trying to obtain
+ * others may take a considerable amount of time in the case of OST failure, and
+ * when other sync requests do not get the released lock from a client, the client
+ * is excluded from the cluster -- such scenarios make life difficult, so
+ * release locks just after they are obtained. */
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+ __u64 *flags, ldlm_policy_data_t *policy,
+ struct ost_lvb *lvb, int kms_valid,
+ obd_enqueue_update_f upcall, void *cookie,
+ struct ldlm_enqueue_info *einfo,
+ struct lustre_handle *lockh,
+ struct ptlrpc_request_set *rqset, int async, int agl)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct ptlrpc_request *req = NULL;
+ int intent = *flags & LDLM_FL_HAS_INTENT;
+ __u64 match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY);
+ ldlm_mode_t mode;
+ int rc;
+
+ /* Filesystem lock extents are extended to page boundaries so that
+ * dealing with the page cache is a little smoother. */
+ policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+ policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+ /*
+ * kms is not valid when either object is completely fresh (so that no
+ * locks are cached), or object was evicted. In the latter case cached
+ * lock cannot be used, because it would prime inode state with
+ * potentially stale LVB.
+ */
+ if (!kms_valid)
+ goto no_match;
+
+ /* Next, search for already existing extent locks that will cover us */
+ /* If we're trying to read, we also search for an existing PW lock. The
+ * VFS and page cache already protect us locally, so lots of readers/
+ * writers can share a single PW lock.
+ *
+ * There are problems with conversion deadlocks, so instead of
+ * converting a read lock to a write lock, we'll just enqueue a new
+ * one.
+ *
+ * At some point we should cancel the read lock instead of making them
+ * send us a blocking callback, but there are problems with canceling
+ * locks out from other users right now, too. */
+ mode = einfo->ei_mode;
+ if (einfo->ei_mode == LCK_PR)
+ mode |= LCK_PW;
+ mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id,
+ einfo->ei_type, policy, mode, lockh, 0);
+ if (mode) {
+ struct ldlm_lock *matched = ldlm_handle2lock(lockh);
+
+ if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) {
+ /* For AGL, if the enqueue RPC is sent but the lock is not
+ * granted, then skip processing this stripe.
+ * Return -ECANCELED to tell the caller. */
+ ldlm_lock_decref(lockh, mode);
+ LDLM_LOCK_PUT(matched);
+ return -ECANCELED;
+ } else if (osc_set_lock_data_with_check(matched, einfo)) {
+ *flags |= LDLM_FL_LVB_READY;
+ /* addref the lock only for non-async requests and when a
+ * PW lock is matched whereas we asked for PR. */
+ if (!rqset && einfo->ei_mode != mode)
+ ldlm_lock_addref(lockh, LCK_PR);
+ if (intent) {
+ /* I would like to be able to ASSERT here that
+ * rss <= kms, but I can't, for reasons which
+ * are explained in lov_enqueue() */
+ }
+
+ /* We already have a lock, and it's referenced.
+ *
+ * At this point, the cl_lock::cll_state is CLS_QUEUING,
+ * AGL upcall may change it to CLS_HELD directly. */
+ (*upcall)(cookie, ELDLM_OK);
+
+ if (einfo->ei_mode != mode)
+ ldlm_lock_decref(lockh, LCK_PW);
+ else if (rqset)
+ /* For async requests, decref the lock. */
+ ldlm_lock_decref(lockh, einfo->ei_mode);
+ LDLM_LOCK_PUT(matched);
+ return ELDLM_OK;
+ } else {
+ ldlm_lock_decref(lockh, mode);
+ LDLM_LOCK_PUT(matched);
+ }
+ }
+
+ no_match:
+ if (intent) {
+ LIST_HEAD(cancels);
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_LDLM_ENQUEUE_LVB);
+ if (req == NULL)
+ return -ENOMEM;
+
+ rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+ sizeof(*lvb));
+ ptlrpc_request_set_replen(req);
+ }
+
+ /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+ *flags &= ~LDLM_FL_BLOCK_GRANTED;
+
+ rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
+ sizeof(*lvb), LVB_T_OST, lockh, async);
+ if (rqset) {
+ if (!rc) {
+ struct osc_enqueue_args *aa;
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->oa_ei = einfo;
+ aa->oa_exp = exp;
+ aa->oa_flags = flags;
+ aa->oa_upcall = upcall;
+ aa->oa_cookie = cookie;
+ aa->oa_lvb = lvb;
+ aa->oa_lockh = lockh;
+ aa->oa_agl = !!agl;
+
+ req->rq_interpret_reply =
+ (ptlrpc_interpterer_t)osc_enqueue_interpret;
+ if (rqset == PTLRPCD_SET)
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ else
+ ptlrpc_set_add_req(rqset, req);
+ } else if (intent) {
+ ptlrpc_req_finished(req);
+ }
+ return rc;
+ }
+
+ rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc);
+ if (intent)
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+ __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+ __u64 *flags, void *data, struct lustre_handle *lockh,
+ int unref)
+{
+ struct obd_device *obd = exp->exp_obd;
+ __u64 lflags = *flags;
+ ldlm_mode_t rc;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
+ return -EIO;
+
+ /* Filesystem lock extents are extended to page boundaries so that
+ * dealing with the page cache is a little smoother */
+ policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+ policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+ /* Next, search for already existing extent locks that will cover us */
+ /* If we're trying to read, we also search for an existing PW lock. The
+ * VFS and page cache already protect us locally, so lots of readers/
+ * writers can share a single PW lock. */
+ rc = mode;
+ if (mode == LCK_PR)
+ rc |= LCK_PW;
+ rc = ldlm_lock_match(obd->obd_namespace, lflags,
+ res_id, type, policy, rc, lockh, unref);
+ if (rc) {
+ if (data != NULL) {
+ if (!osc_set_data_with_check(lockh, data)) {
+ if (!(lflags & LDLM_FL_TEST_LOCK))
+ ldlm_lock_decref(lockh, rc);
+ return 0;
+ }
+ }
+ if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) {
+ ldlm_lock_addref(lockh, LCK_PR);
+ ldlm_lock_decref(lockh, LCK_PW);
+ }
+ return rc;
+ }
+ return rc;
+}
+
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode)
+{
+ if (unlikely(mode == LCK_GROUP))
+ ldlm_lock_decref_and_cancel(lockh, mode);
+ else
+ ldlm_lock_decref(lockh, mode);
+
+ return 0;
+}
+
+static int osc_statfs_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ struct osc_async_args *aa, int rc)
+{
+ struct obd_statfs *msfs;
+
+ if (rc == -EBADR)
+ /* The request has in fact never been sent
+ * due to issues at a higher level (LOV).
+ * Exit immediately since the caller is
+ * aware of the problem and takes care
+ * of the clean up */
+ return rc;
+
+ if ((rc == -ENOTCONN || rc == -EAGAIN) &&
+ (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (rc != 0)
+ goto out;
+
+ msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (msfs == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *aa->aa_oi->oi_osfs = *msfs;
+out:
+ rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+ return rc;
+}
+
+static int osc_statfs_async(struct obd_export *exp,
+ struct obd_info *oinfo, __u64 max_age,
+ struct ptlrpc_request_set *rqset)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct ptlrpc_request *req;
+ struct osc_async_args *aa;
+ int rc;
+
+ /* We could possibly pass max_age in the request (as an absolute
+ * timestamp or a "seconds.usec ago") so the target can avoid doing
+ * extra calls into the filesystem if that isn't necessary (e.g.
+ * during mount that would help a bit). Having relative timestamps
+ * is not so great if request processing is slow, while absolute
+ * timestamps are not ideal because they need time synchronization. */
+ req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
+ if (req == NULL)
+ return -ENOMEM;
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+ ptlrpc_request_set_replen(req);
+ req->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
+ /* procfs requests must not wait (no resend, no delay), to avoid deadlock */
+ req->rq_no_resend = 1;
+ req->rq_no_delay = 1;
+ }
+
+ req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->aa_oi = oinfo;
+
+ ptlrpc_set_add_req(rqset, req);
+ return 0;
+}
+
+static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
+ struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct obd_statfs *msfs;
+ struct ptlrpc_request *req;
+ struct obd_import *imp = NULL;
+ int rc;
+
+ /* Since the request might also come from lprocfs, we need to
+ * sync this with client_disconnect_export(). Bug 15684 */
+ down_read(&obd->u.cli.cl_sem);
+ if (obd->u.cli.cl_import)
+ imp = class_import_get(obd->u.cli.cl_import);
+ up_read(&obd->u.cli.cl_sem);
+ if (!imp)
+ return -ENODEV;
+
+ /* We could possibly pass max_age in the request (as an absolute
+ * timestamp or a "seconds.usec ago") so the target can avoid doing
+ * extra calls into the filesystem if that isn't necessary (e.g.
+ * during mount that would help a bit). Having relative timestamps
+ * is not so great if request processing is slow, while absolute
+ * timestamps are not ideal because they need time synchronization. */
+ req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
+
+ class_import_put(imp);
+
+ if (req == NULL)
+ return -ENOMEM;
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+ ptlrpc_request_set_replen(req);
+ req->rq_request_portal = OST_CREATE_PORTAL;
+ ptlrpc_at_set_req_timeout(req);
+
+ if (flags & OBD_STATFS_NODELAY) {
+ /* procfs requests must not wait (no resend, no delay), to avoid deadlock */
+ req->rq_no_resend = 1;
+ req->rq_no_delay = 1;
+ }
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+ if (msfs == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *osfs = *msfs;
+
+ out:
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_ost_count indicating
+ * the maximum number of OST indices that will fit in the user buffer.
+ * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
+ */
+static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
+{
+ /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+ struct lov_user_md_v3 lum, *lumk;
+ struct lov_user_ost_data_v1 *lmm_objects;
+ int rc = 0, lum_size;
+
+ if (!lsm)
+ return -ENODATA;
+
+ /* we only need the header part from user space to get lmm_magic and
+ * lmm_stripe_count (the header part is common to v1 and v3) */
+ lum_size = sizeof(struct lov_user_md_v1);
+ if (copy_from_user(&lum, lump, lum_size))
+ return -EFAULT;
+
+ if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+ (lum.lmm_magic != LOV_USER_MAGIC_V3))
+ return -EINVAL;
+
+ /* lov_user_md_vX and lov_mds_md_vX must have the same size */
+ LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+ LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+ LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+ /* we can use lov_mds_md_size() to compute lum_size
+ * because lov_user_md_vX and lov_mds_md_vX have the same size */
+ if (lum.lmm_stripe_count > 0) {
+ lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
+ OBD_ALLOC(lumk, lum_size);
+ if (!lumk)
+ return -ENOMEM;
+
+ if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+ lmm_objects =
+ &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+ else
+ lmm_objects = &(lumk->lmm_objects[0]);
+ lmm_objects->l_ost_oi = lsm->lsm_oi;
+ } else {
+ lum_size = lov_mds_md_size(0, lum.lmm_magic);
+ lumk = &lum;
+ }
+
+ lumk->lmm_oi = lsm->lsm_oi;
+ lumk->lmm_stripe_count = 1;
+
+ if (copy_to_user(lump, lumk, lum_size))
+ rc = -EFAULT;
+
+ if (lumk != &lum)
+ OBD_FREE(lumk, lum_size);
+
+ return rc;
+}
+
+
+static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+ void *karg, void *uarg)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_ioctl_data *data = karg;
+ int err = 0;
+
+ if (!try_module_get(THIS_MODULE)) {
+ CERROR("Can't get module. Is it alive?");
+ return -EINVAL;
+ }
+ switch (cmd) {
+ case OBD_IOC_LOV_GET_CONFIG: {
+ char *buf;
+ struct lov_desc *desc;
+ struct obd_uuid uuid;
+
+ buf = NULL;
+ len = 0;
+ if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ data = (struct obd_ioctl_data *)buf;
+
+ if (sizeof(*desc) > data->ioc_inllen1) {
+ obd_ioctl_freedata(buf, len);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (data->ioc_inllen2 < sizeof(uuid)) {
+ obd_ioctl_freedata(buf, len);
+ err = -EINVAL;
+ goto out;
+ }
+
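+ /* an OSC has exactly one target, so report a single-stripe lov_desc
+ * carrying this obd's uuid */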
+ desc = (struct lov_desc *)data->ioc_inlbuf1;
+ desc->ld_tgt_count = 1;
+ desc->ld_active_tgt_count = 1;
+ desc->ld_default_stripe_count = 1;
+ desc->ld_default_stripe_size = 0;
+ desc->ld_default_stripe_offset = 0;
+ desc->ld_pattern = 0;
+ memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
+
+ memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
+
+ err = copy_to_user((void *)uarg, buf, len);
+ if (err)
+ err = -EFAULT;
+ obd_ioctl_freedata(buf, len);
+ goto out;
+ }
+ case LL_IOC_LOV_SETSTRIPE:
+ err = obd_alloc_memmd(exp, karg);
+ if (err > 0)
+ err = 0;
+ goto out;
+ case LL_IOC_LOV_GETSTRIPE:
+ err = osc_getstripe(karg, uarg);
+ goto out;
+ case OBD_IOC_CLIENT_RECOVER:
+ err = ptlrpc_recover_import(obd->u.cli.cl_import,
+ data->ioc_inlbuf1, 0);
+ if (err > 0)
+ err = 0;
+ goto out;
+ case IOC_OSC_SET_ACTIVE:
+ err = ptlrpc_set_import_active(obd->u.cli.cl_import,
+ data->ioc_offset);
+ goto out;
+ case OBD_IOC_POLL_QUOTACHECK:
+ err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+ goto out;
+ case OBD_IOC_PING_TARGET:
+ err = ptlrpc_obd_ping(obd);
+ goto out;
+ default:
+ CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
+ cmd, current_comm());
+ err = -ENOTTY;
+ goto out;
+ }
+out:
+ module_put(THIS_MODULE);
+ return err;
+}
+
+static int osc_get_info(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, __u32 *vallen, void *val,
+ struct lov_stripe_md *lsm)
+{
+ if (!vallen || !val)
+ return -EFAULT;
+
+ if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+ __u32 *stripe = val;
+ *vallen = sizeof(*stripe);
+ *stripe = 0;
+ return 0;
+ } else if (KEY_IS(KEY_LAST_ID)) {
+ struct ptlrpc_request *req;
+ u64 *reply;
+ char *tmp;
+ int rc;
+
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_OST_GET_INFO_LAST_ID);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT, keylen);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ memcpy(tmp, key, keylen);
+
+ req->rq_no_delay = req->rq_no_resend = 1;
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+ if (reply == NULL) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ *((u64 *)val) = *reply;
+ out:
+ ptlrpc_req_finished(req);
+ return rc;
+ } else if (KEY_IS(KEY_FIEMAP)) {
+ struct ll_fiemap_info_key *fm_key =
+ (struct ll_fiemap_info_key *)key;
+ struct ldlm_res_id res_id;
+ ldlm_policy_data_t policy;
+ struct lustre_handle lockh;
+ ldlm_mode_t mode = 0;
+ struct ptlrpc_request *req;
+ struct ll_user_fiemap *reply;
+ char *tmp;
+ int rc;
+
+ if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC))
+ goto skip_locking;
+
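+ /* compute the lock extent covering the fiemap range, rounded out to
+ * page boundaries and capped at OBD_OBJECT_EOF to avoid u64 overflow */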
+ policy.l_extent.start = fm_key->fiemap.fm_start &
+ CFS_PAGE_MASK;
+
+ if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <=
+ fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1)
+ policy.l_extent.end = OBD_OBJECT_EOF;
+ else
+ policy.l_extent.end = (fm_key->fiemap.fm_start +
+ fm_key->fiemap.fm_length +
+ PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK;
+
+ ostid_build_res_name(&fm_key->oa.o_oi, &res_id);
+ mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
+ LDLM_FL_BLOCK_GRANTED |
+ LDLM_FL_LVB_READY,
+ &res_id, LDLM_EXTENT, &policy,
+ LCK_PR | LCK_PW, &lockh, 0);
+ if (mode) { /* lock is cached on client */
+ if (mode != LCK_PR) {
+ ldlm_lock_addref(&lockh, LCK_PR);
+ ldlm_lock_decref(&lockh, LCK_PW);
+ }
+ } else { /* no cached lock, need to acquire the lock on the server side */
+ fm_key->oa.o_valid |= OBD_MD_FLFLAGS;
+ fm_key->oa.o_flags |= OBD_FL_SRVLOCK;
+ }
+
+skip_locking:
+ req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+ &RQF_OST_GET_INFO_FIEMAP);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto drop_lock;
+ }
+
+ req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY,
+ RCL_CLIENT, keylen);
+ req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+ RCL_CLIENT, *vallen);
+ req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+ RCL_SERVER, *vallen);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ goto drop_lock;
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
+ memcpy(tmp, key, keylen);
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+ memcpy(tmp, val, *vallen);
+
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto fini_req;
+
+ reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+ if (reply == NULL) {
+ rc = -EPROTO;
+ goto fini_req;
+ }
+
+ memcpy(val, reply, *vallen);
+fini_req:
+ ptlrpc_req_finished(req);
+drop_lock:
+ if (mode)
+ ldlm_lock_decref(&lockh, LCK_PR);
+ return rc;
+ }
+
+ return -EINVAL;
+}
+
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen,
+ void *val, struct ptlrpc_request_set *set)
+{
+ struct ptlrpc_request *req;
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_import *imp = class_exp2cliimp(exp);
+ char *tmp;
+ int rc;
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
+
+ if (KEY_IS(KEY_CHECKSUM)) {
+ if (vallen != sizeof(int))
+ return -EINVAL;
+ exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
+ return 0;
+ }
+
+ if (KEY_IS(KEY_SPTLRPC_CONF)) {
+ sptlrpc_conf_client_adapt(obd);
+ return 0;
+ }
+
+ if (KEY_IS(KEY_FLUSH_CTX)) {
+ sptlrpc_import_flush_my_ctx(imp);
+ return 0;
+ }
+
+ if (KEY_IS(KEY_CACHE_SET)) {
+ struct client_obd *cli = &obd->u.cli;
+
+ LASSERT(cli->cl_cache == NULL); /* only once */
+ cli->cl_cache = (struct cl_client_cache *)val;
+ atomic_inc(&cli->cl_cache->ccc_users);
+ cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
+
+ /* add this osc into entity list */
+ LASSERT(list_empty(&cli->cl_lru_osc));
+ spin_lock(&cli->cl_cache->ccc_lru_lock);
+ list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
+ spin_unlock(&cli->cl_cache->ccc_lru_lock);
+
+ return 0;
+ }
+
+ if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+ struct client_obd *cli = &obd->u.cli;
+ int nr = atomic_read(&cli->cl_lru_in_list) >> 1;
+ int target = *(int *)val;
+
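+ /* shrink at most half of this client's LRU pages, bounded by the
+ * caller's target, and subtract the number actually freed from *val */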
+ nr = osc_lru_shrink(cli, min(nr, target));
+ *(int *)val -= nr;
+ return 0;
+ }
+
+ if (!set && !KEY_IS(KEY_GRANT_SHRINK))
+ return -EINVAL;
+
+ /* We pass all other commands directly to OST. Since nobody calls osc
+ * methods directly and everybody is supposed to go through LOV, we
+ * assume lov checked invalid values for us.
+ * The only recognised values so far are evict_by_nid and mds_conn.
+ * Even if something bad goes through, we'd get a -EINVAL from OST
+ * anyway. */
+
+ req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+ &RQF_OST_SET_GRANT_INFO :
+ &RQF_OBD_SET_INFO);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT, keylen);
+ if (!KEY_IS(KEY_GRANT_SHRINK))
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+ RCL_CLIENT, vallen);
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ memcpy(tmp, key, keylen);
+ tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+ &RMF_OST_BODY :
+ &RMF_SETINFO_VAL);
+ memcpy(tmp, val, vallen);
+
+ if (KEY_IS(KEY_GRANT_SHRINK)) {
+ struct osc_brw_async_args *aa;
+ struct obdo *oa;
+
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
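+ /* keep a private copy of the obdo so the reply handler
+ * (osc_shrink_grant_interpret) can use it after the reply arrives */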
+ OBDO_ALLOC(oa);
+ if (!oa) {
+ ptlrpc_req_finished(req);
+ return -ENOMEM;
+ }
+ *oa = ((struct ost_body *)val)->oa;
+ aa->aa_oa = oa;
+ req->rq_interpret_reply = osc_shrink_grant_interpret;
+ }
+
+ ptlrpc_request_set_replen(req);
+ if (!KEY_IS(KEY_GRANT_SHRINK)) {
+ LASSERT(set != NULL);
+ ptlrpc_set_add_req(set, req);
+ ptlrpc_check_set(NULL, set);
+ } else
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+ return 0;
+}
+
+static int osc_reconnect(const struct lu_env *env,
+ struct obd_export *exp, struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *data,
+ void *localdata)
+{
+ struct client_obd *cli = &obd->u.cli;
+
+ if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+ long lost_grant;
+
+ client_obd_list_lock(&cli->cl_loi_list_lock);
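+ /* re-request the grant we already account for locally (avail + dirty);
+ * if that is zero, fall back to two full-sized BRWs worth */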
+ data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?:
+ 2 * cli_brw_size(obd);
+ lost_grant = cli->cl_lost_grant;
+ cli->cl_lost_grant = 0;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+ CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d, lost: %ld.\n",
+ data->ocd_connect_flags,
+ data->ocd_version, data->ocd_grant, lost_grant);
+ }
+
+ return 0;
+}
+
+static int osc_disconnect(struct obd_export *exp)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ int rc;
+
+ rc = client_disconnect_export(exp);
+ /**
+ * Initially we put del_shrink_grant before disconnect_export, but it
+ * causes the following problem if setup (connect) and cleanup
+ * (disconnect) are tangled together.
+ * connect p1 disconnect p2
+ * ptlrpc_connect_import
+ * ............... class_manual_cleanup
+ * osc_disconnect
+ * del_shrink_grant
+ * ptlrpc_connect_interrupt
+ * init_grant_shrink
+ * add this client to shrink list
+ * cleanup_osc
+ * Bang! the pinger triggers the shrink.
+ * So the osc should be disconnected from the shrink list, after we
+ * are sure the import has been destroyed. BUG18662
+ */
+ if (obd->u.cli.cl_import == NULL)
+ osc_del_shrink_grant(&obd->u.cli);
+ return rc;
+}
+
+static int osc_import_event(struct obd_device *obd,
+ struct obd_import *imp,
+ enum obd_import_event event)
+{
+ struct client_obd *cli;
+ int rc = 0;
+
+ LASSERT(imp->imp_obd == obd);
+
+ switch (event) {
+ case IMP_EVENT_DISCON: {
+ cli = &obd->u.cli;
+ client_obd_list_lock(&cli->cl_loi_list_lock);
+ cli->cl_avail_grant = 0;
+ cli->cl_lost_grant = 0;
+ client_obd_list_unlock(&cli->cl_loi_list_lock);
+ break;
+ }
+ case IMP_EVENT_INACTIVE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+ break;
+ }
+ case IMP_EVENT_INVALIDATE: {
+ struct ldlm_namespace *ns = obd->obd_namespace;
+ struct lu_env *env;
+ int refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ /* Reset grants */
+ cli = &obd->u.cli;
+ /* all pages go to failing rpcs due to the invalid
+ * import */
+ osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND);
+
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+ cl_env_put(env, &refcheck);
+ } else
+ rc = PTR_ERR(env);
+ break;
+ }
+ case IMP_EVENT_ACTIVE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+ break;
+ }
+ case IMP_EVENT_OCD: {
+ struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+ if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+ osc_init_grant(&obd->u.cli, ocd);
+
+ /* See bug 7198 */
+ if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+ imp->imp_client->cli_request_portal = OST_REQUEST_PORTAL;
+
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+ break;
+ }
+ case IMP_EVENT_DEACTIVATE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL);
+ break;
+ }
+ case IMP_EVENT_ACTIVATE: {
+ rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL);
+ break;
+ }
+ default:
+ CERROR("Unknown import event %d\n", event);
+ LBUG();
+ }
+ return rc;
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying the lock
+ * during recovery, see bug16774 for detailed information.
+ *
+ * \retval zero the lock can't be canceled
+ * \retval other ok to cancel
+ */
+static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+ check_res_locked(lock->l_resource);
+
+ /*
+ * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR.
+ *
+ * XXX as a future improvement, we can also cancel unused write lock
+ * if it doesn't have dirty data and active mmaps.
+ */
+ if (lock->l_resource->lr_type == LDLM_EXTENT &&
+ (lock->l_granted_mode == LCK_PR ||
+ lock->l_granted_mode == LCK_CR) &&
+ (osc_dlm_lock_pageref(lock) == 0))
+ return 1;
+
+ return 0;
+}
+
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+ struct client_obd *cli = data;
+
+ CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
+
+ osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+ return 0;
+}
+
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+ struct client_obd *cli = &obd->u.cli;
+ void *handler;
+ int rc;
+
+ rc = ptlrpcd_addref();
+ if (rc)
+ return rc;
+
+ rc = client_obd_setup(obd, lcfg);
+ if (rc)
+ goto out_ptlrpcd;
+
+ handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+ if (IS_ERR(handler)) {
+ rc = PTR_ERR(handler);
+ goto out_client_setup;
+ }
+ cli->cl_writeback_work = handler;
+
+ rc = osc_quota_setup(obd);
+ if (rc)
+ goto out_ptlrpcd_work;
+
+ cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+ lprocfs_osc_init_vars(&lvars);
+ if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
+ lproc_osc_attach_seqstat(obd);
+ sptlrpc_lprocfs_cliobd_attach(obd);
+ ptlrpc_lprocfs_register_obd(obd);
+ }
+
+ /* We need to allocate a few more requests, because
+ * brw_interpret tries to create new requests before freeing
+ * previous ones. Ideally we want to have 2x max_rpcs_in_flight
+ * reserved, but I'm afraid that might in fact be too much wasted
+ * RAM, so 2 is just my guess and should still work. */
+ cli->cl_import->imp_rq_pool =
+ ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+ OST_MAXREQSIZE,
+ ptlrpc_add_rqs_to_pool);
+
+ INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+ ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
+ return rc;
+
+out_ptlrpcd_work:
+ ptlrpcd_destroy_work(handler);
+out_client_setup:
+ client_obd_cleanup(obd);
+out_ptlrpcd:
+ ptlrpcd_decref();
+ return rc;
+}
+
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+ switch (stage) {
+ case OBD_CLEANUP_EARLY: {
+ struct obd_import *imp;
+ imp = obd->u.cli.cl_import;
+ CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
+ /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
+ ptlrpc_deactivate_import(imp);
+ spin_lock(&imp->imp_lock);
+ imp->imp_pingable = 0;
+ spin_unlock(&imp->imp_lock);
+ break;
+ }
+ case OBD_CLEANUP_EXPORTS: {
+ struct client_obd *cli = &obd->u.cli;
+ /* LU-464
+ * for echo client, export may be on zombie list, wait for
+ * zombie thread to cull it, because cli.cl_import will be
+ * cleared in client_disconnect_export():
+ * class_export_destroy() -> obd_cleanup() ->
+ * echo_device_free() -> echo_client_cleanup() ->
+ * obd_disconnect() -> osc_disconnect() ->
+ * client_disconnect_export()
+ */
+ obd_zombie_barrier();
+ if (cli->cl_writeback_work) {
+ ptlrpcd_destroy_work(cli->cl_writeback_work);
+ cli->cl_writeback_work = NULL;
+ }
+ obd_cleanup_client_import(obd);
+ ptlrpc_lprocfs_unregister_obd(obd);
+ lprocfs_obd_cleanup(obd);
+ break;
+ }
+ }
+ return 0;
+}
+
+int osc_cleanup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+ int rc;
+
+ /* lru cleanup */
+ if (cli->cl_cache != NULL) {
+ LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
+ spin_lock(&cli->cl_cache->ccc_lru_lock);
+ list_del_init(&cli->cl_lru_osc);
+ spin_unlock(&cli->cl_cache->ccc_lru_lock);
+ cli->cl_lru_left = NULL;
+ atomic_dec(&cli->cl_cache->ccc_users);
+ cli->cl_cache = NULL;
+ }
+
+ /* free memory of osc quota cache */
+ osc_quota_cleanup(obd);
+
+ rc = client_obd_cleanup(obd);
+
+ ptlrpcd_decref();
+ return rc;
+}
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+ int rc = 0;
+
+ lprocfs_osc_init_vars(&lvars);
+
+ switch (lcfg->lcfg_command) {
+ default:
+ rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
+ lcfg, obd);
+ if (rc > 0)
+ rc = 0;
+ break;
+ }
+
+ return rc;
+}
+
+static int osc_process_config(struct obd_device *obd, u32 len, void *buf)
+{
+ return osc_process_config_base(obd, buf);
+}
+
+struct obd_ops osc_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_setup = osc_setup,
+ .o_precleanup = osc_precleanup,
+ .o_cleanup = osc_cleanup,
+ .o_add_conn = client_import_add_conn,
+ .o_del_conn = client_import_del_conn,
+ .o_connect = client_connect_import,
+ .o_reconnect = osc_reconnect,
+ .o_disconnect = osc_disconnect,
+ .o_statfs = osc_statfs,
+ .o_statfs_async = osc_statfs_async,
+ .o_packmd = osc_packmd,
+ .o_unpackmd = osc_unpackmd,
+ .o_create = osc_create,
+ .o_destroy = osc_destroy,
+ .o_getattr = osc_getattr,
+ .o_getattr_async = osc_getattr_async,
+ .o_setattr = osc_setattr,
+ .o_setattr_async = osc_setattr_async,
+ .o_find_cbdata = osc_find_cbdata,
+ .o_iocontrol = osc_iocontrol,
+ .o_get_info = osc_get_info,
+ .o_set_info_async = osc_set_info_async,
+ .o_import_event = osc_import_event,
+ .o_process_config = osc_process_config,
+ .o_quotactl = osc_quotactl,
+ .o_quotacheck = osc_quotacheck,
+};
+
+extern struct lu_kmem_descr osc_caches[];
+extern spinlock_t osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;
+
+static int __init osc_init(void)
+{
+ struct lprocfs_static_vars lvars = { NULL };
+ int rc;
+
+ /* print an address of _any_ initialized kernel symbol from this
+ * module, to allow debugging with a gdb that doesn't support data
+ * symbols from modules. */
+ CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
+
+ rc = lu_kmem_init(osc_caches);
+ if (rc)
+ return rc;
+
+ lprocfs_osc_init_vars(&lvars);
+
+ rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
+ LUSTRE_OSC_NAME, &osc_device_type);
+ if (rc) {
+ lu_kmem_fini(osc_caches);
+ return rc;
+ }
+
+ spin_lock_init(&osc_ast_guard);
+ lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
+
+ return rc;
+}
+
+static void /*__exit*/ osc_exit(void)
+{
+ class_unregister_type(LUSTRE_OSC_NAME);
+ lu_kmem_fini(osc_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(LUSTRE_VERSION_STRING);
+
+module_init(osc_init);
+module_exit(osc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile
new file mode 100644
index 000000000..fb50cd4c6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/Makefile
@@ -0,0 +1,20 @@
+obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o
+LDLM := ../../lustre/ldlm/
+
+ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o
+ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
+ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
+ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
+ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
+ldlm_objs += $(LDLM)ldlm_pool.o
+ldlm_objs += $(LDLM)interval_tree.o
+ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
+ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o
+ptlrpc_objs += llog_net.o llog_client.o import.o ptlrpcd.o
+ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
+ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o
+ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o
+
+ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs)
+ptlrpc-$(CONFIG_PROC_FS) += sec_lproc.o
+ptlrpc-$(CONFIG_LUSTRE_TRANSLATE_ERRNOS) += errno.o
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
new file mode 100644
index 000000000..0357f1d45
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/client.c
@@ -0,0 +1,3149 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** Implementation of client-side PortalRPC interfaces */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_req_layout.h"
+
+#include "ptlrpc_internal.h"
+
+static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+static int ptlrpcd_check_work(struct ptlrpc_request *req);
+
+/**
+ * Initialize passed in client structure \a cl.
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+ struct ptlrpc_client *cl)
+{
+ cl->cli_request_portal = req_portal;
+ cl->cli_reply_portal = rep_portal;
+ cl->cli_name = name;
+}
+EXPORT_SYMBOL(ptlrpc_init_client);
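+
+/*
+ * Usage sketch (illustrative only, not referenced by the code below): an OBD
+ * client typically initializes its ptlrpc_client once at setup time.  The
+ * portal constants are assumed to come from lustre_idl.h; the actual portals
+ * depend on which service the client talks to.
+ */
+static void example_client_setup(struct ptlrpc_client *cl)
+{
+ /* request portal of the target service, reply portal of this client */
+ ptlrpc_init_client(OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
+ "example-osc", cl);
+}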
+
+/**
+ * Return the PortalRPC connection for the remote uuid \a uuid
+ */
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
+{
+ struct ptlrpc_connection *c;
+ lnet_nid_t self;
+ lnet_process_id_t peer;
+ int err;
+
+ /* ptlrpc_uuid_to_peer() initializes its 2nd parameter
+ * before accessing its values. */
+ /* coverity[uninit_use_in_call] */
+ err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
+ if (err != 0) {
+ CNETERR("cannot find peer %s!\n", uuid->uuid);
+ return NULL;
+ }
+
+ c = ptlrpc_connection_get(peer, self, uuid);
+ if (c) {
+ memcpy(c->c_remote_uuid.uuid,
+ uuid->uuid, sizeof(c->c_remote_uuid.uuid));
+ }
+
+ CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
+
+ return c;
+}
+EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
+
+/**
+ * Allocate and initialize new bulk descriptor on the sender.
+ * Returns pointer to the descriptor or NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal)
+{
+ struct ptlrpc_bulk_desc *desc;
+ int i;
+
+ OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
+ if (!desc)
+ return NULL;
+
+ spin_lock_init(&desc->bd_lock);
+ init_waitqueue_head(&desc->bd_waitq);
+ desc->bd_max_iov = npages;
+ desc->bd_iov_count = 0;
+ desc->bd_portal = portal;
+ desc->bd_type = type;
+ desc->bd_md_count = 0;
+ LASSERT(max_brw > 0);
+ desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+ /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
+ * node. Negotiated ocd_brw_size will always be <= this number. */
+ for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
+ LNetInvalidateHandle(&desc->bd_mds[i]);
+
+ return desc;
+}
+
+/**
+ * Prepare a bulk descriptor for the specified outgoing request \a req that
+ * can fit \a npages pages. \a type is the bulk type and \a portal is the
+ * portal where the bulk is to be sent. Used on the client side.
+ * Returns pointer to newly allocated initialized bulk descriptor or NULL on
+ * error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+ unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal)
+{
+ struct obd_import *imp = req->rq_import;
+ struct ptlrpc_bulk_desc *desc;
+
+ LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
+ desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
+ if (desc == NULL)
+ return NULL;
+
+ desc->bd_import_generation = req->rq_import_generation;
+ desc->bd_import = class_import_get(imp);
+ desc->bd_req = req;
+
+ desc->bd_cbid.cbid_fn = client_bulk_callback;
+ desc->bd_cbid.cbid_arg = desc;
+
+ /* This makes req own desc; desc is freed when req itself is freed */
+ req->rq_bulk = desc;
+
+ return desc;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+
+/**
+ * Add a page \a page to the bulk descriptor \a desc.
+ * Data to transfer in the page starts at offset \a pageoffset and
+ * amount of data to transfer from the page is \a len
+ */
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset, int len, int pin)
+{
+ LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+ LASSERT(page != NULL);
+ LASSERT(pageoffset >= 0);
+ LASSERT(len > 0);
+ LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);
+
+ desc->bd_nob += len;
+
+ if (pin)
+ page_cache_get(page);
+
+ ptlrpc_add_bulk_page(desc, page, pageoffset, len);
+}
+EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
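+
+/*
+ * Illustrative sketch: preparing a read (server-to-client) bulk for an
+ * already-allocated request.  BULK_PUT_SINK and PAGE_CACHE_SIZE are used
+ * elsewhere in this file; OST_BULK_PORTAL is assumed to be the usual OST
+ * bulk portal from lustre_idl.h.  The caller is assumed to own the pages.
+ */
+static struct ptlrpc_bulk_desc *
+example_prep_read_bulk(struct ptlrpc_request *req, struct page **pages,
+ int npages)
+{
+ struct ptlrpc_bulk_desc *desc;
+ int i;
+
+ /* one bulk operation; the server PUTs data into our pages */
+ desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_PUT_SINK,
+ OST_BULK_PORTAL);
+ if (desc == NULL)
+ return NULL;
+
+ /* transfer whole pages; pin them for the duration of the bulk */
+ for (i = 0; i < npages; i++)
+ __ptlrpc_prep_bulk_page(desc, pages[i], 0, PAGE_CACHE_SIZE, 1);
+
+ return desc;
+}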
+
+/**
+ * Uninitialize and free bulk descriptor \a desc.
+ * Works on bulk descriptors both from server and client side.
+ */
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
+{
+ int i;
+
+ LASSERT(desc != NULL);
+ LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
+ LASSERT(desc->bd_md_count == 0); /* network hands off */
+ LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+
+ sptlrpc_enc_pool_put_pages(desc);
+
+ if (desc->bd_export)
+ class_export_put(desc->bd_export);
+ else
+ class_import_put(desc->bd_import);
+
+ if (unpin) {
+ for (i = 0; i < desc->bd_iov_count; i++)
+ page_cache_release(desc->bd_iov[i].kiov_page);
+ }
+
+ OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
+ bd_iov[desc->bd_max_iov]));
+}
+EXPORT_SYMBOL(__ptlrpc_free_bulk);
+
+/**
+ * Set server timelimit for this req, i.e. how long are we willing to wait
+ * for reply before timing out this request.
+ */
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+ __u32 serv_est;
+ int idx;
+ struct imp_at *at;
+
+ LASSERT(req->rq_import);
+
+ if (AT_OFF) {
+ /* non-AT settings */
+ /**
+ * \a imp_server_timeout means this is a reverse import over which
+ * we send (currently only) ASTs to the client; we cannot afford to
+ * wait too long for the reply, otherwise the other client (on whose
+ * behalf we are sending this request) would time out waiting for us
+ */
+ req->rq_timeout = req->rq_import->imp_server_timeout ?
+ obd_timeout / 2 : obd_timeout;
+ } else {
+ at = &req->rq_import->imp_at;
+ idx = import_at_get_index(req->rq_import,
+ req->rq_request_portal);
+ serv_est = at_get(&at->iat_service_estimate[idx]);
+ req->rq_timeout = at_est2timeout(serv_est);
+ }
+ /* We could get even fancier here, using history to predict increased
+ loading... */
+
+ /* Let the server know what this RPC timeout is by putting it in the
+ * reqmsg. */
+ lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
+
+/* Adjust max service estimate based on server value */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+ unsigned int serv_est)
+{
+ int idx;
+ unsigned int oldse;
+ struct imp_at *at;
+
+ LASSERT(req->rq_import);
+ at = &req->rq_import->imp_at;
+
+ idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+ /* max service estimates are tracked on the server side,
+ so just keep minimal history here */
+ oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
+ if (oldse != 0)
+ CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
+ req->rq_import->imp_obd->obd_name, req->rq_request_portal,
+ oldse, at_get(&at->iat_service_estimate[idx]));
+}
+
+/* Expected network latency per remote node (secs) */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+ return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+ unsigned int service_time)
+{
+ unsigned int nl, oldnl;
+ struct imp_at *at;
+ time_t now = get_seconds();
+
+ LASSERT(req->rq_import);
+
+ if (service_time > now - req->rq_sent + 3) {
+ /* bz16408: this can also happen if an early reply is lost and the
+ * client RPC expires and is resent. The early reply or the reply to
+ * the original RPC can still fit in the reply buffer of the resent
+ * RPC; the client then measures time from the resend, while the
+ * server reports the service time of the original RPC.
+ */
+ CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
+ D_ADAPTTO : D_WARNING,
+ "Reported service time %u > total measured time "
+ CFS_DURATION_T"\n", service_time,
+ cfs_time_sub(now, req->rq_sent));
+ return;
+ }
+
+ /* Network latency is total time less server processing time */
+ nl = max_t(int, now - req->rq_sent -
+ service_time, 0) + 1; /* st rounding */
+ at = &req->rq_import->imp_at;
+
+ oldnl = at_measured(&at->iat_net_latency, nl);
+ if (oldnl != 0)
+ CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n",
+ req->rq_import->imp_obd->obd_name,
+ obd_uuid2str(
+ &req->rq_import->imp_connection->c_remote_uuid),
+ oldnl, at_get(&at->iat_net_latency));
+}
+
+static int unpack_reply(struct ptlrpc_request *req)
+{
+ int rc;
+
+ if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+ rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+ return -EPROTO;
+ }
+ }
+
+ rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+ return -EPROTO;
+ }
+ return 0;
+}
+
+/**
+ * Handle an early reply message, called with the rq_lock held.
+ * If anything goes wrong just ignore it - same as if it never happened
+ */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request *early_req;
+ time_t olddl;
+ int rc;
+
+ req->rq_early = 0;
+ spin_unlock(&req->rq_lock);
+
+ rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+ if (rc) {
+ spin_lock(&req->rq_lock);
+ return rc;
+ }
+
+ rc = unpack_reply(early_req);
+ if (rc == 0) {
+ /* Expecting to increase the service time estimate here */
+ ptlrpc_at_adj_service(req,
+ lustre_msg_get_timeout(early_req->rq_repmsg));
+ ptlrpc_at_adj_net_latency(req,
+ lustre_msg_get_service_time(early_req->rq_repmsg));
+ }
+
+ sptlrpc_cli_finish_early_reply(early_req);
+
+ if (rc != 0) {
+ spin_lock(&req->rq_lock);
+ return rc;
+ }
+
+ /* Adjust the local timeout for this req */
+ ptlrpc_at_set_req_timeout(req);
+
+ spin_lock(&req->rq_lock);
+ olddl = req->rq_deadline;
+ /* server assumes it now has rq_timeout from when it sent the
+ * early reply, so client should give it at least that long. */
+ req->rq_deadline = get_seconds() + req->rq_timeout +
+ ptlrpc_at_get_net_latency(req);
+
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply #%d, new deadline in " CFS_DURATION_T "s (" CFS_DURATION_T "s)",
+ req->rq_early_count,
+ cfs_time_sub(req->rq_deadline, get_seconds()),
+ cfs_time_sub(req->rq_deadline, olddl));
+
+ return rc;
+}
+
+struct kmem_cache *request_cache;
+
+int ptlrpc_request_cache_init(void)
+{
+ request_cache = kmem_cache_create("ptlrpc_cache",
+ sizeof(struct ptlrpc_request),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ return request_cache == NULL ? -ENOMEM : 0;
+}
+
+void ptlrpc_request_cache_fini(void)
+{
+ kmem_cache_destroy(request_cache);
+}
+
+struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
+{
+ struct ptlrpc_request *req;
+
+ OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
+ return req;
+}
+
+void ptlrpc_request_cache_free(struct ptlrpc_request *req)
+{
+ OBD_SLAB_FREE_PTR(req, request_cache);
+}
+
+/**
+ * Wind down request pool \a pool.
+ * Frees all requests from the pool too
+ */
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
+{
+ struct list_head *l, *tmp;
+ struct ptlrpc_request *req;
+
+ LASSERT(pool != NULL);
+
+ spin_lock(&pool->prp_lock);
+ list_for_each_safe(l, tmp, &pool->prp_req_list) {
+ req = list_entry(l, struct ptlrpc_request, rq_list);
+ list_del(&req->rq_list);
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
+ OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
+ ptlrpc_request_cache_free(req);
+ }
+ spin_unlock(&pool->prp_lock);
+ OBD_FREE(pool, sizeof(*pool));
+}
+EXPORT_SYMBOL(ptlrpc_free_rq_pool);
+
+/**
+ * Allocates, initializes and adds \a num_rq requests to the pool \a pool
+ */
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+{
+ int i;
+ int size = 1;
+
+ while (size < pool->prp_rq_size)
+ size <<= 1;
+
+ LASSERTF(list_empty(&pool->prp_req_list) ||
+ size == pool->prp_rq_size,
+ "Trying to change pool size with nonempty pool from %d to %d bytes\n",
+ pool->prp_rq_size, size);
+
+ spin_lock(&pool->prp_lock);
+ pool->prp_rq_size = size;
+ for (i = 0; i < num_rq; i++) {
+ struct ptlrpc_request *req;
+ struct lustre_msg *msg;
+
+ spin_unlock(&pool->prp_lock);
+ req = ptlrpc_request_cache_alloc(GFP_NOFS);
+ if (!req)
+ return;
+ OBD_ALLOC_LARGE(msg, size);
+ if (!msg) {
+ ptlrpc_request_cache_free(req);
+ return;
+ }
+ req->rq_reqbuf = msg;
+ req->rq_reqbuf_len = size;
+ req->rq_pool = pool;
+ spin_lock(&pool->prp_lock);
+ list_add_tail(&req->rq_list, &pool->prp_req_list);
+ }
+ spin_unlock(&pool->prp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
+
+/**
+ * Create and initialize new request pool with given attributes:
+ * \a num_rq - initial number of requests to create for the pool
+ * \a msgsize - maximum message size possible for requests in this pool
+ * \a populate_pool - function to be called when more requests need to be added
+ * to the pool
+ * Returns pointer to newly created pool or NULL on error.
+ */
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int num_rq, int msgsize,
+ void (*populate_pool)(struct ptlrpc_request_pool *, int))
+{
+ struct ptlrpc_request_pool *pool;
+
+ OBD_ALLOC(pool, sizeof(struct ptlrpc_request_pool));
+ if (!pool)
+ return NULL;
+
+ /* Request the next power of two for the allocation, because the kernel
+ would internally do exactly this */
+
+ spin_lock_init(&pool->prp_lock);
+ INIT_LIST_HEAD(&pool->prp_req_list);
+ pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
+ pool->prp_populate = populate_pool;
+
+ populate_pool(pool, num_rq);
+
+ if (list_empty(&pool->prp_req_list)) {
+ /* have not allocated a single request for the pool */
+ OBD_FREE(pool, sizeof(struct ptlrpc_request_pool));
+ pool = NULL;
+ }
+ return pool;
+}
+EXPORT_SYMBOL(ptlrpc_init_rq_pool);
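+
+/*
+ * Illustrative sketch: a typical request-pool lifecycle.  A pool is created
+ * with ptlrpc_add_rqs_to_pool() as the populate callback, handed to
+ * ptlrpc_request_alloc_pool() when requests are allocated, and torn down
+ * with ptlrpc_free_rq_pool().  The sizes below are arbitrary examples.
+ */
+static struct ptlrpc_request_pool *example_create_pool(void)
+{
+ /* 8 pre-allocated requests, each with room for a 4096-byte message */
+ return ptlrpc_init_rq_pool(8, 4096, ptlrpc_add_rqs_to_pool);
+}
+
+static void example_destroy_pool(struct ptlrpc_request_pool *pool)
+{
+ if (pool != NULL)
+ ptlrpc_free_rq_pool(pool);
+}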
+
+/**
+ * Fetches one request from pool \a pool
+ */
+static struct ptlrpc_request *
+ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
+{
+ struct ptlrpc_request *request;
+ struct lustre_msg *reqbuf;
+
+ if (!pool)
+ return NULL;
+
+ spin_lock(&pool->prp_lock);
+
+ /* See if we have anything in the pool and bail out if not.
+ * In the writeout path, where this matters, this is safe to do:
+ * nothing is lost in that case, and when some in-flight requests
+ * complete, this code will be called again. */
+ if (unlikely(list_empty(&pool->prp_req_list))) {
+ spin_unlock(&pool->prp_lock);
+ return NULL;
+ }
+
+ request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
+ rq_list);
+ list_del_init(&request->rq_list);
+ spin_unlock(&pool->prp_lock);
+
+ LASSERT(request->rq_reqbuf);
+ LASSERT(request->rq_pool);
+
+ reqbuf = request->rq_reqbuf;
+ memset(request, 0, sizeof(*request));
+ request->rq_reqbuf = reqbuf;
+ request->rq_reqbuf_len = pool->prp_rq_size;
+ request->rq_pool = pool;
+
+ return request;
+}
+
+/**
+ * Returns freed \a request to pool.
+ */
+static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
+{
+ struct ptlrpc_request_pool *pool = request->rq_pool;
+
+ spin_lock(&pool->prp_lock);
+ LASSERT(list_empty(&request->rq_list));
+ LASSERT(!request->rq_receiving_reply);
+ list_add_tail(&request->rq_list, &pool->prp_req_list);
+ spin_unlock(&pool->prp_lock);
+}
+
+static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode,
+ int count, __u32 *lengths, char **bufs,
+ struct ptlrpc_cli_ctx *ctx)
+{
+ struct obd_import *imp = request->rq_import;
+ int rc;
+
+ if (unlikely(ctx))
+ request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
+ else {
+ rc = sptlrpc_req_get_ctx(request);
+ if (rc)
+ goto out_free;
+ }
+
+ sptlrpc_req_set_flavor(request, opcode);
+
+ rc = lustre_pack_request(request, imp->imp_msg_magic, count,
+ lengths, bufs);
+ if (rc) {
+ LASSERT(!request->rq_pool);
+ goto out_ctx;
+ }
+
+ lustre_msg_add_version(request->rq_reqmsg, version);
+ request->rq_send_state = LUSTRE_IMP_FULL;
+ request->rq_type = PTL_RPC_MSG_REQUEST;
+ request->rq_export = NULL;
+
+ request->rq_req_cbid.cbid_fn = request_out_callback;
+ request->rq_req_cbid.cbid_arg = request;
+
+ request->rq_reply_cbid.cbid_fn = reply_in_callback;
+ request->rq_reply_cbid.cbid_arg = request;
+
+ request->rq_reply_deadline = 0;
+ request->rq_phase = RQ_PHASE_NEW;
+ request->rq_next_phase = RQ_PHASE_UNDEFINED;
+
+ request->rq_request_portal = imp->imp_client->cli_request_portal;
+ request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+
+ ptlrpc_at_set_req_timeout(request);
+
+ spin_lock_init(&request->rq_lock);
+ INIT_LIST_HEAD(&request->rq_list);
+ INIT_LIST_HEAD(&request->rq_timed_list);
+ INIT_LIST_HEAD(&request->rq_replay_list);
+ INIT_LIST_HEAD(&request->rq_ctx_chain);
+ INIT_LIST_HEAD(&request->rq_set_chain);
+ INIT_LIST_HEAD(&request->rq_history_list);
+ INIT_LIST_HEAD(&request->rq_exp_list);
+ init_waitqueue_head(&request->rq_reply_waitq);
+ init_waitqueue_head(&request->rq_set_waitq);
+ request->rq_xid = ptlrpc_next_xid();
+ atomic_set(&request->rq_refcount, 1);
+
+ lustre_msg_set_opc(request->rq_reqmsg, opcode);
+
+ return 0;
+out_ctx:
+ sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
+out_free:
+ class_import_put(imp);
+ return rc;
+}
+
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode, char **bufs,
+ struct ptlrpc_cli_ctx *ctx)
+{
+ int count;
+
+ count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
+ return __ptlrpc_request_bufs_pack(request, version, opcode, count,
+ request->rq_pill.rc_area[RCL_CLIENT],
+ bufs, ctx);
+}
+EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
+
+/**
+ * Pack request buffers for network transfer, performing encryption
+ * steps if necessary.
+ */
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+ __u32 version, int opcode)
+{
+ int rc;
+ rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
+ if (rc)
+ return rc;
+
+ /* Some old 1.8 clients (< 1.8.7) LASSERT that the size of the
+ * ptlrpc_body sent from the server equals the local ptlrpc_body size,
+ * so we have to send the old ptlrpc_body to keep interoperability
+ * with these clients.
+ *
+ * Only three kinds of server->client RPCs so far:
+ * - LDLM_BL_CALLBACK
+ * - LDLM_CP_CALLBACK
+ * - LDLM_GL_CALLBACK
+ *
+ * XXX This should be removed whenever we drop interoperability with
+ * these old clients.
+ */
+ if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
+ opcode == LDLM_GL_CALLBACK)
+ req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
+ sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_request_pack);
+
+/**
+ * Helper function to allocate a new request on import \a imp,
+ * possibly reusing an existing request from pool \a pool if provided.
+ * Returns allocated request structure with import field filled or
+ * NULL on error.
+ */
+static inline
+struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
+ struct ptlrpc_request_pool *pool)
+{
+ struct ptlrpc_request *request = NULL;
+
+ if (pool)
+ request = ptlrpc_prep_req_from_pool(pool);
+
+ if (!request)
+ request = ptlrpc_request_cache_alloc(GFP_NOFS);
+
+ if (request) {
+ LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
+ LASSERT(imp != LP_POISON);
+ LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+ imp->imp_client);
+ LASSERT(imp->imp_client != LP_POISON);
+
+ request->rq_import = class_import_get(imp);
+ } else {
+ CERROR("request allocation out of memory\n");
+ }
+
+ return request;
+}
+
+/**
+ * Helper function for creating a request.
+ * Calls __ptlrpc_request_alloc to allocate new request structure and inits
+ * buffer structures according to capsule template \a format.
+ * Returns allocated request structure pointer or NULL on error.
+ */
+static struct ptlrpc_request *
+ptlrpc_request_alloc_internal(struct obd_import *imp,
+ struct ptlrpc_request_pool *pool,
+ const struct req_format *format)
+{
+ struct ptlrpc_request *request;
+
+ request = __ptlrpc_request_alloc(imp, pool);
+ if (request == NULL)
+ return NULL;
+
+ req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
+ req_capsule_set(&request->rq_pill, format);
+ return request;
+}
+
+/**
+ * Allocate new request structure for import \a imp and initialize its
+ * buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+ const struct req_format *format)
+{
+ return ptlrpc_request_alloc_internal(imp, NULL, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc);
+
+/**
+ * Allocate new request structure for import \a imp from pool \a pool and
+ * initialize its buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+ struct ptlrpc_request_pool *pool,
+ const struct req_format *format)
+{
+ return ptlrpc_request_alloc_internal(imp, pool, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
+
+/**
+ * For requests not from pool, free memory of the request structure.
+ * For requests obtained from a pool earlier, return request back to pool.
+ */
+void ptlrpc_request_free(struct ptlrpc_request *request)
+{
+ if (request->rq_pool)
+ __ptlrpc_free_req_to_pool(request);
+ else
+ ptlrpc_request_cache_free(request);
+}
+EXPORT_SYMBOL(ptlrpc_request_free);
+
+/**
+ * Allocate new request for operation \a opcode and immediately pack it for
+ * network transfer.
+ * Only used for simple requests like OBD_PING where the only important
+ * part of the request is the operation itself.
+ * Returns allocated request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+ const struct req_format *format,
+ __u32 version, int opcode)
+{
+ struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
+ int rc;
+
+ if (req) {
+ rc = ptlrpc_request_pack(req, version, opcode);
+ if (rc) {
+ ptlrpc_request_free(req);
+ req = NULL;
+ }
+ }
+ return req;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
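+
+/*
+ * Illustrative sketch: allocating and packing a simple one-shot RPC in a
+ * single call.  RQF_OBD_PING, LUSTRE_OBD_VERSION and OBD_PING are assumed
+ * to be the usual Lustre ping request format, version and opcode; any other
+ * format/opcode pair is handled the same way.
+ */
+static struct ptlrpc_request *example_alloc_ping(struct obd_import *imp)
+{
+ /* returns a packed request ready to be sent, or NULL on error */
+ return ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+ LUSTRE_OBD_VERSION, OBD_PING);
+}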
+
+/**
+ * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
+ * for operation \a opcode. The request will contain \a count buffers.
+ * The buffer sizes are described in the array \a lengths and the buffers
+ * themselves are provided via the pointer \a bufs.
+ * Returns prepared request structure pointer or NULL on error.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req_pool(struct obd_import *imp,
+ __u32 version, int opcode,
+ int count, __u32 *lengths, char **bufs,
+ struct ptlrpc_request_pool *pool)
+{
+ struct ptlrpc_request *request;
+ int rc;
+
+ request = __ptlrpc_request_alloc(imp, pool);
+ if (!request)
+ return NULL;
+
+ rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
+ lengths, bufs, NULL);
+ if (rc) {
+ ptlrpc_request_free(request);
+ request = NULL;
+ }
+ return request;
+}
+EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+
+/**
+ * Same as ptlrpc_prep_req_pool, but without pool
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
+ __u32 *lengths, char **bufs)
+{
+ return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
+ NULL);
+}
+EXPORT_SYMBOL(ptlrpc_prep_req);
+
+/**
+ * Allocate and initialize new request set structure.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_set(void)
+{
+ struct ptlrpc_request_set *set;
+
+ OBD_ALLOC(set, sizeof(*set));
+ if (!set)
+ return NULL;
+ atomic_set(&set->set_refcount, 1);
+ INIT_LIST_HEAD(&set->set_requests);
+ init_waitqueue_head(&set->set_waitq);
+ atomic_set(&set->set_new_count, 0);
+ atomic_set(&set->set_remaining, 0);
+ spin_lock_init(&set->set_new_req_lock);
+ INIT_LIST_HEAD(&set->set_new_requests);
+ INIT_LIST_HEAD(&set->set_cblist);
+ set->set_max_inflight = UINT_MAX;
+ set->set_producer = NULL;
+ set->set_producer_arg = NULL;
+ set->set_rc = 0;
+
+ return set;
+}
+EXPORT_SYMBOL(ptlrpc_prep_set);
+
+/**
+ * Allocate and initialize new request set structure with flow control
+ * extension. This extension makes it possible to control the number of
+ * requests in flight for the whole set. A callback function to generate
+ * requests must be provided, and the request set will keep the number of
+ * requests in flight at or below \a max.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+ void *arg)
+
+{
+ struct ptlrpc_request_set *set;
+
+ set = ptlrpc_prep_set();
+ if (!set)
+ return NULL;
+
+ set->set_max_inflight = max;
+ set->set_producer = func;
+ set->set_producer_arg = arg;
+
+ return set;
+}
+EXPORT_SYMBOL(ptlrpc_prep_fcset);
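+
+/*
+ * Illustrative sketch of a flow-controlled set: the producer callback is
+ * invoked whenever the set has fewer than \a max requests in flight and is
+ * expected to add one more request via ptlrpc_set_add_req(), or to return
+ * -ENOENT once there is nothing left to send.  example_make_next_req() is a
+ * hypothetical helper standing in for whatever generates the next RPC.
+ */
+static struct ptlrpc_request *example_make_next_req(void *arg); /* hypothetical */
+
+static int example_producer(struct ptlrpc_request_set *set, void *arg)
+{
+ struct ptlrpc_request *req;
+
+ req = example_make_next_req(arg);
+ if (req == NULL)
+ return -ENOENT; /* no more RPCs to produce */
+
+ ptlrpc_set_add_req(set, req);
+ return 0;
+}
+
+static struct ptlrpc_request_set *example_prep_fcset(void *arg)
+{
+ /* keep at most 8 RPCs in flight at any time */
+ return ptlrpc_prep_fcset(8, example_producer, arg);
+}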
+
+/**
+ * Wind down and free request set structure previously allocated with
+ * ptlrpc_prep_set.
+ * Ensures that all requests on the set have completed and removes
+ * all requests from the request list in a set.
+ * If any unsent requests happen to be on the list, pretend that they got
+ * an error in flight and call their completion handlers.
+ */
+void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp;
+ struct list_head *next;
+ int expected_phase;
+ int n = 0;
+
+ /* Requests on the set should either all be completed, or all be new */
+ expected_phase = (atomic_read(&set->set_remaining) == 0) ?
+ RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
+ list_for_each(tmp, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+
+ LASSERT(req->rq_phase == expected_phase);
+ n++;
+ }
+
+ LASSERTF(atomic_read(&set->set_remaining) == 0 ||
+ atomic_read(&set->set_remaining) == n, "%d / %d\n",
+ atomic_read(&set->set_remaining), n);
+
+ list_for_each_safe(tmp, next, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+ list_del_init(&req->rq_set_chain);
+
+ LASSERT(req->rq_phase == expected_phase);
+
+ if (req->rq_phase == RQ_PHASE_NEW) {
+ ptlrpc_req_interpret(NULL, req, -EBADR);
+ atomic_dec(&set->set_remaining);
+ }
+
+ spin_lock(&req->rq_lock);
+ req->rq_set = NULL;
+ req->rq_invalid_rqset = 0;
+ spin_unlock(&req->rq_lock);
+
+ ptlrpc_req_finished(req);
+ }
+
+ LASSERT(atomic_read(&set->set_remaining) == 0);
+
+ ptlrpc_reqset_put(set);
+}
+EXPORT_SYMBOL(ptlrpc_set_destroy);
+
+/**
+ * Add a callback function \a fn to the set.
+ * The callback will be called when all requests on this set have completed.
+ * It will be passed the \a data argument.
+ */
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+ set_interpreter_func fn, void *data)
+{
+ struct ptlrpc_set_cbdata *cbdata;
+
+ OBD_ALLOC_PTR(cbdata);
+ if (cbdata == NULL)
+ return -ENOMEM;
+
+ cbdata->psc_interpret = fn;
+ cbdata->psc_data = data;
+ list_add_tail(&cbdata->psc_item, &set->set_cblist);
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_set_add_cb);
+
+/**
+ * Add a new request to the general purpose request set.
+ * Assumes request reference from the caller.
+ */
+void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
+ struct ptlrpc_request *req)
+{
+ LASSERT(list_empty(&req->rq_set_chain));
+
+ /* The set takes over the caller's request reference */
+ list_add_tail(&req->rq_set_chain, &set->set_requests);
+ req->rq_set = set;
+ atomic_inc(&set->set_remaining);
+ req->rq_queued_time = cfs_time_current();
+
+ if (req->rq_reqmsg != NULL)
+ lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+ if (set->set_producer != NULL)
+ /* If the request set has a producer callback, the RPC must be
+ * sent straight away */
+ ptlrpc_send_new_req(req);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_req);
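+
+/*
+ * Illustrative sketch: the common synchronous pattern built from the set API
+ * above - create a set, add an already-packed request, wait for completion
+ * and destroy the set.  ptlrpc_set_wait() is defined later in this file;
+ * error handling is reduced to the minimum needed to show the flow.
+ */
+static int example_send_and_wait(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request_set *set;
+ int rc;
+
+ set = ptlrpc_prep_set();
+ if (set == NULL)
+ return -ENOMEM;
+
+ /* the set takes over our reference on req */
+ ptlrpc_set_add_req(set, req);
+
+ /* sends the request and waits for it to complete */
+ rc = ptlrpc_set_wait(set);
+
+ /* drops the request reference taken by the set */
+ ptlrpc_set_destroy(set);
+ return rc;
+}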
+
+/**
+ * Add a request to a request set with a dedicated server thread
+ * and wake the thread so it can do any necessary processing.
+ * Currently only used for ptlrpcd.
+ */
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+ struct ptlrpc_request *req)
+{
+ struct ptlrpc_request_set *set = pc->pc_set;
+ int count, i;
+
+ LASSERT(req->rq_set == NULL);
+ LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
+
+ spin_lock(&set->set_new_req_lock);
+ /*
+ * The set takes over the caller's request reference.
+ */
+ req->rq_set = set;
+ req->rq_queued_time = cfs_time_current();
+ list_add_tail(&req->rq_set_chain, &set->set_new_requests);
+ count = atomic_inc_return(&set->set_new_count);
+ spin_unlock(&set->set_new_req_lock);
+
+ /* Only need to call wakeup once for the first entry. */
+ if (count == 1) {
+ wake_up(&set->set_waitq);
+
+ /* XXX: It may be unnecessary to wake up all the partners, but to
+ * guarantee that the async RPC is processed ASAP we have no better
+ * choice. This may be fixed in the future. */
+ for (i = 0; i < pc->pc_npartners; i++)
+ wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+ }
+}
+EXPORT_SYMBOL(ptlrpc_set_add_new_req);
+
+/**
+ * Based on the current state of the import, determine if the request
+ * can be sent, is an error, or should be delayed.
+ *
+ * Returns true if this request should be delayed. If false and
+ * *status is set, then the request cannot be sent and *status is the
+ * error code. If false and *status is 0, then the request can be sent.
+ *
+ * The imp->imp_lock must be held.
+ */
+static int ptlrpc_import_delay_req(struct obd_import *imp,
+ struct ptlrpc_request *req, int *status)
+{
+ int delay = 0;
+
+ LASSERT(status != NULL);
+ *status = 0;
+
+ if (req->rq_ctx_init || req->rq_ctx_fini) {
+ /* always allow ctx init/fini RPCs to go through */
+ } else if (imp->imp_state == LUSTRE_IMP_NEW) {
+ DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
+ *status = -EIO;
+ } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+ /* pings may safely race with umount */
+ DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
+ D_HA : D_ERROR, req, "IMP_CLOSED ");
+ *status = -EIO;
+ } else if (ptlrpc_send_limit_expired(req)) {
+ /* probably doesn't need to be a D_ERROR after initial testing */
+ DEBUG_REQ(D_ERROR, req, "send limit expired ");
+ *status = -EIO;
+ } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
+ imp->imp_state == LUSTRE_IMP_CONNECTING) {
+ /* allow CONNECT even if import is invalid */
+ if (atomic_read(&imp->imp_inval_count) != 0) {
+ DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+ *status = -EIO;
+ }
+ } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
+ if (!imp->imp_deactive)
+ DEBUG_REQ(D_NET, req, "IMP_INVALID");
+ *status = -ESHUTDOWN; /* bz 12940 */
+ } else if (req->rq_import_generation != imp->imp_generation) {
+ DEBUG_REQ(D_ERROR, req, "req wrong generation:");
+ *status = -EIO;
+ } else if (req->rq_send_state != imp->imp_state) {
+ /* invalidate in progress - any requests should be dropped */
+ if (atomic_read(&imp->imp_inval_count) != 0) {
+ DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+ *status = -EIO;
+ } else if (imp->imp_dlm_fake || req->rq_no_delay) {
+ *status = -EWOULDBLOCK;
+ } else if (req->rq_allow_replay &&
+ (imp->imp_state == LUSTRE_IMP_REPLAY ||
+ imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
+ imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
+ imp->imp_state == LUSTRE_IMP_RECOVER)) {
+ DEBUG_REQ(D_HA, req, "allow during recovery.\n");
+ } else {
+ delay = 1;
+ }
+ }
+
+ return delay;
+}
+
+/**
+ * Decide if the error message regarding provided request \a req
+ * should be printed to the console or not.
+ * Makes its decision based on request status and other properties.
+ * Returns 1 to print the error on the system console or 0 otherwise.
+ */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+ __u32 opc;
+ int err;
+
+ LASSERT(req->rq_reqmsg != NULL);
+ opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+ /* Suppress particular reconnect errors which are to be expected. No
+ * errors are suppressed for the initial connection on an import */
+ if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+ (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
+
+ /* Suppress timed out reconnect requests */
+ if (req->rq_timedout)
+ return 0;
+
+ /* Suppress unavailable/again reconnect requests */
+ err = lustre_msg_get_status(req->rq_repmsg);
+ if (err == -ENODEV || err == -EAGAIN)
+ return 0;
+ }
+
+ return 1;
+}
+
+/**
+ * Check request processing status.
+ * Returns the status.
+ */
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+ int err;
+
+ err = lustre_msg_get_status(req->rq_repmsg);
+ if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+ struct obd_import *imp = req->rq_import;
+ __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+ if (ptlrpc_console_allow(req))
+ LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n",
+ imp->imp_obd->obd_name,
+ libcfs_nid2str(
+ imp->imp_connection->c_peer.nid),
+ ll_opcode2str(opc), err);
+ return err < 0 ? err : -EINVAL;
+ }
+
+ if (err < 0) {
+ DEBUG_REQ(D_INFO, req, "status is %d", err);
+ } else if (err > 0) {
+ /* XXX: translate this error from net to host */
+ DEBUG_REQ(D_INFO, req, "status is %d", err);
+ }
+
+ return err;
+}
+
+/**
+ * Save pre-versions of objects into the request for replay.
+ * Versions are obtained from the server reply.
+ * Used for VBR (version-based recovery).
+ */
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+ struct lustre_msg *repmsg = req->rq_repmsg;
+ struct lustre_msg *reqmsg = req->rq_reqmsg;
+ __u64 *versions = lustre_msg_get_versions(repmsg);
+
+ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
+ return;
+
+ LASSERT(versions);
+ lustre_msg_set_versions(reqmsg, versions);
+ CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
+ versions[0], versions[1]);
+}
+
+/**
+ * Callback function called when client receives RPC reply for \a req.
+ * Returns 0 on success or error code.
+ * The return value is assigned to req->rq_status by the caller
+ * as the request processing status.
+ * This function also decides if the request needs to be saved for later replay.
+ */
+static int after_reply(struct ptlrpc_request *req)
+{
+ struct obd_import *imp = req->rq_import;
+ struct obd_device *obd = req->rq_import->imp_obd;
+ int rc;
+ struct timeval work_start;
+ long timediff;
+
+ LASSERT(obd != NULL);
+ /* repbuf must be unlinked */
+ LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink);
+
+ if (req->rq_reply_truncate) {
+ if (ptlrpc_no_resend(req)) {
+ DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d",
+ req->rq_nob_received, req->rq_repbuf_len);
+ return -EOVERFLOW;
+ }
+
+ sptlrpc_cli_free_repbuf(req);
+ /* Pass the required reply buffer size (including
+ * space for the early reply).
+ * NB: no need to round up because alloc_repbuf
+ * will round it up */
+ req->rq_replen = req->rq_nob_received;
+ req->rq_nob_received = 0;
+ spin_lock(&req->rq_lock);
+ req->rq_resend = 1;
+ spin_unlock(&req->rq_lock);
+ return 0;
+ }
+
+ /*
+ * NB Until this point, the whole of the incoming message,
+ * including buflens, status etc is in the sender's byte order.
+ */
+ rc = sptlrpc_cli_unwrap_reply(req);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
+ return rc;
+ }
+
+ /*
+ * Security layer unwrap might ask resend this request.
+ */
+ if (req->rq_resend)
+ return 0;
+
+ rc = unpack_reply(req);
+ if (rc)
+ return rc;
+
+ /* retry indefinitely on EINPROGRESS */
+ if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+ ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+ time_t now = get_seconds();
+
+ DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
+ spin_lock(&req->rq_lock);
+ req->rq_resend = 1;
+ spin_unlock(&req->rq_lock);
+ req->rq_nr_resend++;
+
+ /* allocate new xid to avoid reply reconstruction */
+ if (!req->rq_bulk) {
+ /* new xid is already allocated for bulk in
+ * ptlrpc_check_set() */
+ req->rq_xid = ptlrpc_next_xid();
+ DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS");
+ }
+
+ /* Readjust the timeout for current conditions */
+ ptlrpc_at_set_req_timeout(req);
+ /* delay the resend to give the server a chance to get ready.
+ * The delay is increased by 1s on every resend and is capped to
+ * the current request timeout (i.e. obd_timeout if AT is off,
+ * or AT service time x 125% + 5s, see at_est2timeout) */
+ if (req->rq_nr_resend > req->rq_timeout)
+ req->rq_sent = now + req->rq_timeout;
+ else
+ req->rq_sent = now + req->rq_nr_resend;
+
+ return 0;
+ }
+
+ do_gettimeofday(&work_start);
+ timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
+ if (obd->obd_svc_stats != NULL) {
+ lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
+ timediff);
+ ptlrpc_lprocfs_rpc_sent(req, timediff);
+ }
+
+ if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
+ lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
+ DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
+ lustre_msg_get_type(req->rq_repmsg));
+ return -EPROTO;
+ }
+
+ if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
+ CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
+ ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+ ptlrpc_at_adj_net_latency(req,
+ lustre_msg_get_service_time(req->rq_repmsg));
+
+ rc = ptlrpc_check_status(req);
+ imp->imp_connect_error = rc;
+
+ if (rc) {
+ /*
+ * Either we've been evicted, or the server has failed for
+ * some reason. Try to reconnect, and if that fails, punt to
+ * the upcall.
+ */
+ if (ll_rpc_recoverable_error(rc)) {
+ if (req->rq_send_state != LUSTRE_IMP_FULL ||
+ imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
+ return rc;
+ }
+ ptlrpc_request_handle_notconn(req);
+ return rc;
+ }
+ } else {
+ /*
+ * Let's look if server sent slv. Do it only for RPC with
+ * rc == 0.
+ */
+ ldlm_cli_update_pool(req);
+ }
+
+ /*
+ * Store transno in reqmsg for replay.
+ */
+ if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+ req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
+ lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+ }
+
+ if (imp->imp_replayable) {
+ spin_lock(&imp->imp_lock);
+ /*
+ * No point in adding already-committed requests to the replay
+ * list, we will just remove them immediately. b=9829
+ */
+ if (req->rq_transno != 0 &&
+ (req->rq_transno >
+ lustre_msg_get_last_committed(req->rq_repmsg) ||
+ req->rq_replay)) {
+ /** version recovery */
+ ptlrpc_save_versions(req);
+ ptlrpc_retain_replayable_request(req, imp);
+ } else if (req->rq_commit_cb != NULL &&
+ list_empty(&req->rq_replay_list)) {
+ /* NB: don't call rq_commit_cb if it's already on
+ * rq_replay_list, ptlrpc_free_committed() will call
+ * it later, see LU-3618 for details */
+ spin_unlock(&imp->imp_lock);
+ req->rq_commit_cb(req);
+ spin_lock(&imp->imp_lock);
+ }
+
+ /*
+ * Replay-enabled imports return commit-status information.
+ */
+ if (lustre_msg_get_last_committed(req->rq_repmsg)) {
+ imp->imp_peer_committed_transno =
+ lustre_msg_get_last_committed(req->rq_repmsg);
+ }
+
+ ptlrpc_free_committed(imp);
+
+ if (!list_empty(&imp->imp_replay_list)) {
+ struct ptlrpc_request *last;
+
+ last = list_entry(imp->imp_replay_list.prev,
+ struct ptlrpc_request,
+ rq_replay_list);
+ /*
+ * Requests with rq_replay stay on the list even if no
+ * commit is expected.
+ */
+ if (last->rq_transno > imp->imp_peer_committed_transno)
+ ptlrpc_pinger_commit_expected(imp);
+ }
+
+ spin_unlock(&imp->imp_lock);
+ }
+
+ return rc;
+}
+
+/**
+ * Helper function to send request \a req over the network for the first time
+ * Also adjusts request phase.
+ * Returns 0 on success or error code.
+ */
+static int ptlrpc_send_new_req(struct ptlrpc_request *req)
+{
+ struct obd_import *imp = req->rq_import;
+ int rc;
+
+ LASSERT(req->rq_phase == RQ_PHASE_NEW);
+ if (req->rq_sent && (req->rq_sent > get_seconds()) &&
+ (!req->rq_generation_set ||
+ req->rq_import_generation == imp->imp_generation))
+ return 0;
+
+ ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
+
+ spin_lock(&imp->imp_lock);
+
+ if (!req->rq_generation_set)
+ req->rq_import_generation = imp->imp_generation;
+
+ if (ptlrpc_import_delay_req(imp, req, &rc)) {
+ spin_lock(&req->rq_lock);
+ req->rq_waiting = 1;
+ spin_unlock(&req->rq_lock);
+
+ DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)",
+ lustre_msg_get_status(req->rq_reqmsg),
+ ptlrpc_import_state_name(req->rq_send_state),
+ ptlrpc_import_state_name(imp->imp_state));
+ LASSERT(list_empty(&req->rq_list));
+ list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+ atomic_inc(&req->rq_import->imp_inflight);
+ spin_unlock(&imp->imp_lock);
+ return 0;
+ }
+
+ if (rc != 0) {
+ spin_unlock(&imp->imp_lock);
+ req->rq_status = rc;
+ ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+ return rc;
+ }
+
+ LASSERT(list_empty(&req->rq_list));
+ list_add_tail(&req->rq_list, &imp->imp_sending_list);
+ atomic_inc(&req->rq_import->imp_inflight);
+ spin_unlock(&imp->imp_lock);
+
+ lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+ rc = sptlrpc_req_refresh_ctx(req, -1);
+ if (rc) {
+ if (req->rq_err) {
+ req->rq_status = rc;
+ return 1;
+ }
+ spin_lock(&req->rq_lock);
+ req->rq_wait_ctx = 1;
+ spin_unlock(&req->rq_lock);
+ return 0;
+ }
+
+ CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
+ current_comm(),
+ imp->imp_obd->obd_uuid.uuid,
+ lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ lustre_msg_get_opc(req->rq_reqmsg));
+
+ rc = ptl_send_rpc(req, 0);
+ if (rc) {
+ DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
+ spin_lock(&req->rq_lock);
+ req->rq_net_err = 1;
+ spin_unlock(&req->rq_lock);
+ return rc;
+ }
+ return 0;
+}
+
+static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
+{
+ int remaining, rc;
+
+ LASSERT(set->set_producer != NULL);
+
+ remaining = atomic_read(&set->set_remaining);
+
+ /* populate the ->set_requests list with requests until we
+ * reach the maximum number of RPCs in flight for this set */
+ while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
+ rc = set->set_producer(set, set->set_producer_arg);
+ if (rc == -ENOENT) {
+ /* no more RPC to produce */
+ set->set_producer = NULL;
+ set->set_producer_arg = NULL;
+ return 0;
+ }
+ }
+
+ return (atomic_read(&set->set_remaining) - remaining);
+}
+
+/**
+ * this sends any unsent RPCs in \a set and returns 1 if all are sent
+ * and no more replies are expected.
+ * (it is possible to get fewer replies than requests sent, e.g. due to
+ * timed-out requests or requests that we had trouble sending out)
+ *
+ * NOTE: This function contains a potential schedule point (cond_resched()).
+ */
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp, *next;
+ struct list_head comp_reqs;
+ int force_timer_recalc = 0;
+
+ if (atomic_read(&set->set_remaining) == 0)
+ return 1;
+
+ INIT_LIST_HEAD(&comp_reqs);
+ list_for_each_safe(tmp, next, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+ struct obd_import *imp = req->rq_import;
+ int unregistered = 0;
+ int rc = 0;
+
+ /* This schedule point is mainly for the ptlrpcd caller of this
+ * function. Most ptlrpc sets are not long-lived and unbounded
+ * in length, but at the least the set used by the ptlrpcd is.
+ * Since the processing time is unbounded, we need to insert an
+ * explicit schedule point to make the thread well-behaved.
+ */
+ cond_resched();
+
+ if (req->rq_phase == RQ_PHASE_NEW &&
+ ptlrpc_send_new_req(req)) {
+ force_timer_recalc = 1;
+ }
+
+ /* delayed send - skip */
+ if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
+ continue;
+
+ /* delayed resend - skip */
+ if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
+ req->rq_sent > get_seconds())
+ continue;
+
+ if (!(req->rq_phase == RQ_PHASE_RPC ||
+ req->rq_phase == RQ_PHASE_BULK ||
+ req->rq_phase == RQ_PHASE_INTERPRET ||
+ req->rq_phase == RQ_PHASE_UNREGISTERING ||
+ req->rq_phase == RQ_PHASE_COMPLETE)) {
+ DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
+ LBUG();
+ }
+
+ if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+ LASSERT(req->rq_next_phase != req->rq_phase);
+ LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+ /*
+ * Skip processing until reply is unlinked. We
+ * can't return to pool before that and we can't
+ * call interpret before that. We need to make
+ * sure that all rdma transfers finished and will
+ * not corrupt any data.
+ */
+ if (ptlrpc_client_recv_or_unlink(req) ||
+ ptlrpc_client_bulk_active(req))
+ continue;
+
+ /*
+ * Turn fail_loc off to prevent it from looping
+ * forever.
+ */
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+ OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
+ OBD_FAIL_ONCE);
+ }
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
+ OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
+ OBD_FAIL_ONCE);
+ }
+
+ /*
+ * Move to next phase if reply was successfully
+ * unlinked.
+ */
+ ptlrpc_rqphase_move(req, req->rq_next_phase);
+ }
+
+ if (req->rq_phase == RQ_PHASE_COMPLETE) {
+ list_move_tail(&req->rq_set_chain, &comp_reqs);
+ continue;
+ }
+
+ if (req->rq_phase == RQ_PHASE_INTERPRET)
+ goto interpret;
+
+ /*
+ * Note that this also will start async reply unlink.
+ */
+ if (req->rq_net_err && !req->rq_timedout) {
+ ptlrpc_expire_one_request(req, 1);
+
+ /*
+ * Check if we still need to wait for unlink.
+ */
+ if (ptlrpc_client_recv_or_unlink(req) ||
+ ptlrpc_client_bulk_active(req))
+ continue;
+ /* If there is no need to resend, fail it now. */
+ if (req->rq_no_resend) {
+ if (req->rq_status == 0)
+ req->rq_status = -EIO;
+ ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+ goto interpret;
+ } else {
+ continue;
+ }
+ }
+
+ if (req->rq_err) {
+ spin_lock(&req->rq_lock);
+ req->rq_replied = 0;
+ spin_unlock(&req->rq_lock);
+ if (req->rq_status == 0)
+ req->rq_status = -EIO;
+ ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+ goto interpret;
+ }
+
+ /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
+ * so it sets rq_intr regardless of individual rpc
+ * timeouts. The synchronous IO waiting path sets
+ * rq_intr irrespective of whether ptlrpcd
+ * has seen a timeout. Our policy is to only interpret
+ * interrupted rpcs after they have timed out, so we
+ * need to enforce that here.
+ */
+
+ if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
+ req->rq_wait_ctx)) {
+ req->rq_status = -EINTR;
+ ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+ goto interpret;
+ }
+
+ if (req->rq_phase == RQ_PHASE_RPC) {
+ if (req->rq_timedout || req->rq_resend ||
+ req->rq_waiting || req->rq_wait_ctx) {
+ int status;
+
+ if (!ptlrpc_unregister_reply(req, 1))
+ continue;
+
+ spin_lock(&imp->imp_lock);
+ if (ptlrpc_import_delay_req(imp, req,
+ &status)) {
+ /* put on the delayed list - only while we
+ * wait for recovery to finish - before sending */
+ list_del_init(&req->rq_list);
+ list_add_tail(&req->rq_list,
+ &imp->
+ imp_delayed_list);
+ spin_unlock(&imp->imp_lock);
+ continue;
+ }
+
+ if (status != 0) {
+ req->rq_status = status;
+ ptlrpc_rqphase_move(req,
+ RQ_PHASE_INTERPRET);
+ spin_unlock(&imp->imp_lock);
+ goto interpret;
+ }
+ if (ptlrpc_no_resend(req) &&
+ !req->rq_wait_ctx) {
+ req->rq_status = -ENOTCONN;
+ ptlrpc_rqphase_move(req,
+ RQ_PHASE_INTERPRET);
+ spin_unlock(&imp->imp_lock);
+ goto interpret;
+ }
+
+ list_del_init(&req->rq_list);
+ list_add_tail(&req->rq_list,
+ &imp->imp_sending_list);
+
+ spin_unlock(&imp->imp_lock);
+
+ spin_lock(&req->rq_lock);
+ req->rq_waiting = 0;
+ spin_unlock(&req->rq_lock);
+
+ if (req->rq_timedout || req->rq_resend) {
+ /* This is being re-sent anyway,
+ * so mark the req as a resend. */
+ spin_lock(&req->rq_lock);
+ req->rq_resend = 1;
+ spin_unlock(&req->rq_lock);
+ if (req->rq_bulk) {
+ __u64 old_xid;
+
+ if (!ptlrpc_unregister_bulk(req, 1))
+ continue;
+
+ /* ensure previous bulk fails */
+ old_xid = req->rq_xid;
+ req->rq_xid = ptlrpc_next_xid();
+ CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
+ old_xid, req->rq_xid);
+ }
+ }
+ /*
+ * rq_wait_ctx is only touched by ptlrpcd,
+ * so no lock is needed here.
+ */
+ status = sptlrpc_req_refresh_ctx(req, -1);
+ if (status) {
+ if (req->rq_err) {
+ req->rq_status = status;
+ spin_lock(&req->rq_lock);
+ req->rq_wait_ctx = 0;
+ spin_unlock(&req->rq_lock);
+ force_timer_recalc = 1;
+ } else {
+ spin_lock(&req->rq_lock);
+ req->rq_wait_ctx = 1;
+ spin_unlock(&req->rq_lock);
+ }
+
+ continue;
+ } else {
+ spin_lock(&req->rq_lock);
+ req->rq_wait_ctx = 0;
+ spin_unlock(&req->rq_lock);
+ }
+
+ rc = ptl_send_rpc(req, 0);
+ if (rc) {
+ DEBUG_REQ(D_HA, req,
+ "send failed: rc = %d", rc);
+ force_timer_recalc = 1;
+ spin_lock(&req->rq_lock);
+ req->rq_net_err = 1;
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+ /* need to reset the timeout */
+ force_timer_recalc = 1;
+ }
+
+ spin_lock(&req->rq_lock);
+
+ if (ptlrpc_client_early(req)) {
+ ptlrpc_at_recv_early_reply(req);
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+
+ /* Still waiting for a reply? */
+ if (ptlrpc_client_recv(req)) {
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+
+ /* Did we actually receive a reply? */
+ if (!ptlrpc_client_replied(req)) {
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+
+ spin_unlock(&req->rq_lock);
+
+ /* unlink from net because we are going to
+ * swab the reply buffer in place */
+ unregistered = ptlrpc_unregister_reply(req, 1);
+ if (!unregistered)
+ continue;
+
+ req->rq_status = after_reply(req);
+ if (req->rq_resend)
+ continue;
+
+ /* If there is no bulk associated with this request,
+ * then we're done and should let the interpreter
+ * process the reply. Similarly if the RPC returned
+ * an error, and therefore the bulk will never arrive.
+ */
+ if (req->rq_bulk == NULL || req->rq_status < 0) {
+ ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+ goto interpret;
+ }
+
+ ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
+ }
+
+ LASSERT(req->rq_phase == RQ_PHASE_BULK);
+ if (ptlrpc_client_bulk_active(req))
+ continue;
+
+ if (req->rq_bulk->bd_failure) {
+ /* The RPC reply arrived OK, but the bulk transfer
+ * failed! This is odd, since the server told us the
+ * RPC succeeded after getting the REPLY for its GET
+ * or the ACK for its PUT. */
+ DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+ req->rq_status = -EIO;
+ }
+
+ ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+
+interpret:
+ LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
+
+ /* This moves to the "unregistering" phase; we need to
+ * wait for the reply to be unlinked. */
+ if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
+ /* start async bulk unlink too */
+ ptlrpc_unregister_bulk(req, 1);
+ continue;
+ }
+
+ if (!ptlrpc_unregister_bulk(req, 1))
+ continue;
+
+ /* By the time interpret is called, receiving should
+ * already have finished. */
+ LASSERT(!req->rq_receiving_reply);
+
+ ptlrpc_req_interpret(env, req, req->rq_status);
+
+ if (ptlrpcd_check_work(req)) {
+ atomic_dec(&set->set_remaining);
+ continue;
+ }
+ ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+
+ CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
+ "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
+ current_comm(), imp->imp_obd->obd_uuid.uuid,
+ lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ lustre_msg_get_opc(req->rq_reqmsg));
+
+ spin_lock(&imp->imp_lock);
+ /* The request may already be off the sending or delayed list. This
+ * can happen when it is marked erroneous because
+ * ptlrpc_import_delay_req(req, status) found it impossible to allow
+ * sending this RPC and returned *status != 0. */
+ if (!list_empty(&req->rq_list)) {
+ list_del_init(&req->rq_list);
+ atomic_dec(&imp->imp_inflight);
+ }
+ spin_unlock(&imp->imp_lock);
+
+ atomic_dec(&set->set_remaining);
+ wake_up_all(&imp->imp_recovery_waitq);
+
+ if (set->set_producer) {
+ /* produce a new request if possible */
+ if (ptlrpc_set_producer(set) > 0)
+ force_timer_recalc = 1;
+
+ /* free the request that has just been completed
+ * in order not to pollute set->set_requests */
+ list_del_init(&req->rq_set_chain);
+ spin_lock(&req->rq_lock);
+ req->rq_set = NULL;
+ req->rq_invalid_rqset = 0;
+ spin_unlock(&req->rq_lock);
+
+ /* record rq_status to compute the final status later */
+ if (req->rq_status != 0)
+ set->set_rc = req->rq_status;
+ ptlrpc_req_finished(req);
+ } else {
+ list_move_tail(&req->rq_set_chain, &comp_reqs);
+ }
+ }
+
+ /* move completed requests to the head of the list so it's easier for
+ * the caller to find them */
+ list_splice(&comp_reqs, &set->set_requests);
+
+ /* If we hit an error, we want to recover promptly. */
+ return atomic_read(&set->set_remaining) == 0 || force_timer_recalc;
+}
+EXPORT_SYMBOL(ptlrpc_check_set);
+
+/**
+ * Time out request \a req. If \a async_unlink is set, do not wait
+ * until LNet actually confirms network buffer unlinking.
+ * Return 1 if we should give up further retrying attempts or 0 otherwise.
+ */
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
+{
+ struct obd_import *imp = req->rq_import;
+ int rc = 0;
+
+ spin_lock(&req->rq_lock);
+ req->rq_timedout = 1;
+ spin_unlock(&req->rq_lock);
+
+ DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
+ "/real "CFS_DURATION_T"]",
+ req->rq_net_err ? "failed due to network error" :
+ ((req->rq_real_sent == 0 ||
+ time_before((unsigned long)req->rq_real_sent, (unsigned long)req->rq_sent) ||
+ cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
+ "timed out for sent delay" : "timed out for slow reply"),
+ req->rq_sent, req->rq_real_sent);
+
+ if (imp != NULL && obd_debug_peer_on_timeout)
+ LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+
+ ptlrpc_unregister_reply(req, async_unlink);
+ ptlrpc_unregister_bulk(req, async_unlink);
+
+ if (obd_dump_on_timeout)
+ libcfs_debug_dumplog();
+
+ if (imp == NULL) {
+ DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
+ return 1;
+ }
+
+ atomic_inc(&imp->imp_timeouts);
+
+ /* The DLM server doesn't want recovery run on its imports. */
+ if (imp->imp_dlm_fake)
+ return 1;
+
+ /* If this request is for recovery or other primordial tasks,
+ * then error it out here. */
+ if (req->rq_ctx_init || req->rq_ctx_fini ||
+ req->rq_send_state != LUSTRE_IMP_FULL ||
+ imp->imp_obd->obd_no_recov) {
+ DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+ ptlrpc_import_state_name(req->rq_send_state),
+ ptlrpc_import_state_name(imp->imp_state));
+ spin_lock(&req->rq_lock);
+ req->rq_status = -ETIMEDOUT;
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ return 1;
+ }
+
+ /* if a request can't be resent we can't wait for an answer after
+ the timeout */
+ if (ptlrpc_no_resend(req)) {
+ DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
+ rc = 1;
+ }
+
+ ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+
+ return rc;
+}
+
+/**
+ * Time out all uncompleted requests in request set pointed by \a data
+ * Callback used when waiting on sets with l_wait_event.
+ * Always returns 1.
+ */
+int ptlrpc_expired_set(void *data)
+{
+ struct ptlrpc_request_set *set = data;
+ struct list_head *tmp;
+ time_t now = get_seconds();
+
+ LASSERT(set != NULL);
+
+ /*
+ * A timeout expired. See which reqs it applies to...
+ */
+ list_for_each(tmp, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+
+ /* don't expire request waiting for context */
+ if (req->rq_wait_ctx)
+ continue;
+
+ /* Request in-flight? */
+ if (!((req->rq_phase == RQ_PHASE_RPC &&
+ !req->rq_waiting && !req->rq_resend) ||
+ (req->rq_phase == RQ_PHASE_BULK)))
+ continue;
+
+ if (req->rq_timedout || /* already dealt with */
+ req->rq_deadline > now) /* not expired */
+ continue;
+
+ /* Deal with this request. Do it asynchronously so as not
+ * to block the ptlrpcd thread. */
+ ptlrpc_expire_one_request(req, 1);
+ }
+
+ /*
+ * When waiting for a whole set, we always break out of the
+ * sleep so we can recalculate the timeout, or enable interrupts
+ * if everyone's timed out.
+ */
+ return 1;
+}
+EXPORT_SYMBOL(ptlrpc_expired_set);
+
+/**
+ * Sets rq_intr flag in \a req under spinlock.
+ */
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
+{
+ spin_lock(&req->rq_lock);
+ req->rq_intr = 1;
+ spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_mark_interrupted);
+
+/**
+ * Interrupts (sets interrupted flag) all uncompleted requests in
+ * a set \a data. Callback for l_wait_event for interruptible waits.
+ */
+void ptlrpc_interrupted_set(void *data)
+{
+ struct ptlrpc_request_set *set = data;
+ struct list_head *tmp;
+
+ LASSERT(set != NULL);
+ CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
+
+ list_for_each(tmp, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+
+ if (req->rq_phase != RQ_PHASE_RPC &&
+ req->rq_phase != RQ_PHASE_UNREGISTERING)
+ continue;
+
+ ptlrpc_mark_interrupted(req);
+ }
+}
+EXPORT_SYMBOL(ptlrpc_interrupted_set);
+
+/**
+ * Get the smallest timeout in the set; this does NOT set a timeout.
+ */
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp;
+ time_t now = get_seconds();
+ int timeout = 0;
+ struct ptlrpc_request *req;
+ int deadline;
+
+ list_for_each(tmp, &set->set_requests) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+ /*
+ * Request in-flight?
+ */
+ if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+ (req->rq_phase == RQ_PHASE_BULK) ||
+ (req->rq_phase == RQ_PHASE_NEW)))
+ continue;
+
+ /*
+ * Already timed out.
+ */
+ if (req->rq_timedout)
+ continue;
+
+ /*
+ * Waiting for ctx.
+ */
+ if (req->rq_wait_ctx)
+ continue;
+
+ if (req->rq_phase == RQ_PHASE_NEW)
+ deadline = req->rq_sent;
+ else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
+ deadline = req->rq_sent;
+ else
+ deadline = req->rq_sent + req->rq_timeout;
+
+ if (deadline <= now) /* actually expired already */
+ timeout = 1; /* ASAP */
+ else if (timeout == 0 || timeout > deadline - now)
+ timeout = deadline - now;
+ }
+ return timeout;
+}
+EXPORT_SYMBOL(ptlrpc_set_next_timeout);
+
+/**
+ * Send all unsent requests from the set and then wait until all
+ * requests in the set complete (either get a reply, timeout, get an
+ * error or otherwise be interrupted).
+ * Returns 0 on success or error code otherwise.
+ */
+int ptlrpc_set_wait(struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp;
+ struct ptlrpc_request *req;
+ struct l_wait_info lwi;
+ int rc, timeout;
+
+ if (set->set_producer)
+ (void)ptlrpc_set_producer(set);
+ else
+ list_for_each(tmp, &set->set_requests) {
+ req = list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+ if (req->rq_phase == RQ_PHASE_NEW)
+ (void)ptlrpc_send_new_req(req);
+ }
+
+ if (list_empty(&set->set_requests))
+ return 0;
+
+ do {
+ timeout = ptlrpc_set_next_timeout(set);
+
+ /* wait until all complete, interrupted, or an in-flight
+ * req times out */
+ CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
+ set, timeout);
+
+ if (timeout == 0 && !cfs_signal_pending())
+ /*
+ * No requests are in-flight (either timed out
+ * or delayed), so we can allow interrupts.
+ * We still want to block for a limited time,
+ * so we allow interrupts during the timeout.
+ */
+ lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
+ ptlrpc_expired_set,
+ ptlrpc_interrupted_set, set);
+ else
+ /*
+ * At least one request is in flight, so no
+ * interrupts are allowed. Wait until all
+ * complete, or an in-flight req times out.
+ */
+ lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+ ptlrpc_expired_set, set);
+
+ rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
+
+ /* LU-769 - if we ignored the signal because it was already
+ * pending when we started, we need to handle it now or we risk
+ * it being ignored forever */
+ if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
+ cfs_signal_pending()) {
+ sigset_t blocked_sigs =
+ cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+ /* In fact we only interrupt for the "fatal" signals
+ * like SIGINT or SIGKILL. We still ignore less
+ * important signals since ptlrpc set is not easily
+ * reentrant from userspace again */
+ if (cfs_signal_pending())
+ ptlrpc_interrupted_set(set);
+ cfs_restore_sigs(blocked_sigs);
+ }
+
+ LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
+
+ /* -EINTR => all requests have been flagged rq_intr so next
+ * check completes.
+ * -ETIMEDOUT => someone timed out. When all reqs have
+ * timed out, signals are enabled allowing completion with
+ * EINTR.
+ * I don't really care if we go once more round the loop in
+ * the error cases -eeb. */
+ if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
+ list_for_each(tmp, &set->set_requests) {
+ req = list_entry(tmp, struct ptlrpc_request,
+ rq_set_chain);
+ spin_lock(&req->rq_lock);
+ req->rq_invalid_rqset = 1;
+ spin_unlock(&req->rq_lock);
+ }
+ }
+ } while (rc != 0 || atomic_read(&set->set_remaining) != 0);
+
+ LASSERT(atomic_read(&set->set_remaining) == 0);
+
+ rc = set->set_rc; /* rq_status of already freed requests if any */
+ list_for_each(tmp, &set->set_requests) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+ LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
+ if (req->rq_status != 0)
+ rc = req->rq_status;
+ }
+
+ if (set->set_interpret != NULL) {
+ int (*interpreter)(struct ptlrpc_request_set *set, void *, int) =
+ set->set_interpret;
+ rc = interpreter(set, set->set_arg, rc);
+ } else {
+ struct ptlrpc_set_cbdata *cbdata, *n;
+ int err;
+
+ list_for_each_entry_safe(cbdata, n,
+ &set->set_cblist, psc_item) {
+ list_del_init(&cbdata->psc_item);
+ err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
+ if (err && !rc)
+ rc = err;
+ OBD_FREE_PTR(cbdata);
+ }
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_set_wait);
+
+/**
+ * Helper function for request freeing.
+ * Called when the request refcount reaches zero and the request needs to be
+ * freed. Removes the request from any sending/replay lists it might be on and
+ * frees network buffers if any are present.
+ * If \a locked is set, the caller is already holding the import imp_lock and
+ * we do not need to reobtain it (for certain list manipulations).
+ */
+static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
+{
+ if (request == NULL)
+ return;
+ LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
+ LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */
+ LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
+ LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+ LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
+ LASSERTF(!request->rq_replay, "req %p\n", request);
+
+ req_capsule_fini(&request->rq_pill);
+
+ /* We must take it off the imp_replay_list first. Otherwise, we'll set
+ * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
+ if (request->rq_import != NULL) {
+ if (!locked)
+ spin_lock(&request->rq_import->imp_lock);
+ list_del_init(&request->rq_replay_list);
+ if (!locked)
+ spin_unlock(&request->rq_import->imp_lock);
+ }
+ LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
+
+ if (atomic_read(&request->rq_refcount) != 0) {
+ DEBUG_REQ(D_ERROR, request,
+ "freeing request with nonzero refcount");
+ LBUG();
+ }
+
+ if (request->rq_repbuf != NULL)
+ sptlrpc_cli_free_repbuf(request);
+ if (request->rq_export != NULL) {
+ class_export_put(request->rq_export);
+ request->rq_export = NULL;
+ }
+ if (request->rq_import != NULL) {
+ class_import_put(request->rq_import);
+ request->rq_import = NULL;
+ }
+ if (request->rq_bulk != NULL)
+ ptlrpc_free_bulk_pin(request->rq_bulk);
+
+ if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
+ sptlrpc_cli_free_reqbuf(request);
+
+ if (request->rq_cli_ctx)
+ sptlrpc_req_put_ctx(request, !locked);
+
+ if (request->rq_pool)
+ __ptlrpc_free_req_to_pool(request);
+ else
+ ptlrpc_request_cache_free(request);
+}
+
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
+/**
+ * Drop one request reference. Must be called with import imp_lock held.
+ * When reference count drops to zero, request is freed.
+ */
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
+{
+ assert_spin_locked(&request->rq_import->imp_lock);
+ (void)__ptlrpc_req_finished(request, 1);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
+
+/**
+ * Helper function
+ * Drops one reference count for request \a request.
+ * \a locked set indicates that caller holds import imp_lock.
+ * Frees the request when reference count reaches zero.
+ */
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
+{
+ if (request == NULL)
+ return 1;
+
+ if (request == LP_POISON ||
+ request->rq_reqmsg == LP_POISON) {
+ CERROR("dereferencing freed request (bug 575)\n");
+ LBUG();
+ return 1;
+ }
+
+ DEBUG_REQ(D_INFO, request, "refcount now %u",
+ atomic_read(&request->rq_refcount) - 1);
+
+ if (atomic_dec_and_test(&request->rq_refcount)) {
+ __ptlrpc_free_req(request, locked);
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * Drops one reference count for a request.
+ */
+void ptlrpc_req_finished(struct ptlrpc_request *request)
+{
+ __ptlrpc_req_finished(request, 0);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished);
+
+/**
+ * Return the xid of \a request.
+ */
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
+{
+ return request->rq_xid;
+}
+EXPORT_SYMBOL(ptlrpc_req_xid);
+
+/**
+ * Disengage the client's reply buffer from the network.
+ * NB does _NOT_ unregister any client-side bulk.
+ * IDEMPOTENT, but _not_ safe against concurrent callers.
+ * The request owner (i.e. the thread doing the I/O) must call...
+ * Returns 1 when the reply buffer is fully unlinked (or there was nothing to
+ * unlink), or 0 when an asynchronous unlink is still in progress.
+ */
+int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
+{
+ int rc;
+ wait_queue_head_t *wq;
+ struct l_wait_info lwi;
+
+ /*
+ * Might sleep.
+ */
+ LASSERT(!in_interrupt());
+
+ /*
+ * Let's setup deadline for reply unlink.
+ */
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+ async && request->rq_reply_deadline == 0)
+ request->rq_reply_deadline = get_seconds()+LONG_UNLINK;
+
+ /*
+ * Nothing left to do.
+ */
+ if (!ptlrpc_client_recv_or_unlink(request))
+ return 1;
+
+ LNetMDUnlink(request->rq_reply_md_h);
+
+ /*
+ * Let's check it once again.
+ */
+ if (!ptlrpc_client_recv_or_unlink(request))
+ return 1;
+
+ /*
+ * Move to "Unregistering" phase as reply was not unlinked yet.
+ */
+ ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+
+ /*
+ * Do not wait for unlink to finish.
+ */
+ if (async)
+ return 0;
+
+ /*
+ * We have to l_wait_event() whatever the result, to give liblustre
+ * a chance to run reply_in_callback(), and to make sure we've
+ * unlinked before returning a req to the pool.
+ */
+ if (request->rq_set != NULL)
+ wq = &request->rq_set->set_waitq;
+ else
+ wq = &request->rq_reply_waitq;
+
+ for (;;) {
+ /* Network access will complete in finite time but the HUGE
+ * timeout lets us CWARN for visibility of sluggish NALs */
+ lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+ cfs_time_seconds(1), NULL, NULL);
+ rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
+ &lwi);
+ if (rc == 0) {
+ ptlrpc_rqphase_move(request, request->rq_next_phase);
+ return 1;
+ }
+
+ LASSERT(rc == -ETIMEDOUT);
+ DEBUG_REQ(D_WARNING, request,
+ "Unexpectedly long timeout rvcng=%d unlnk=%d/%d",
+ request->rq_receiving_reply,
+ request->rq_req_unlink, request->rq_reply_unlink);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unregister_reply);
+
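+/*
+ * Drop a committed request from its import's replay list and release the
+ * replay reference. Callers hold imp_lock, so the request is finished via
+ * __ptlrpc_req_finished() with \a locked set.
+ */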
+static void ptlrpc_free_request(struct ptlrpc_request *req)
+{
+ spin_lock(&req->rq_lock);
+ req->rq_replay = 0;
+ spin_unlock(&req->rq_lock);
+
+ if (req->rq_commit_cb != NULL)
+ req->rq_commit_cb(req);
+ list_del_init(&req->rq_replay_list);
+
+ __ptlrpc_req_finished(req, 1);
+}
+
+/**
+ * the request is committed and dropped from the replay list of its import
+ */
+void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
+{
+ struct obd_import *imp = req->rq_import;
+
+ spin_lock(&imp->imp_lock);
+ if (list_empty(&req->rq_replay_list)) {
+ spin_unlock(&imp->imp_lock);
+ return;
+ }
+
+ if (force || req->rq_transno <= imp->imp_peer_committed_transno)
+ ptlrpc_free_request(req);
+
+ spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_request_committed);
+
+/**
+ * Iterates through the replay_list on the import and prunes all requests
+ * that have a transno smaller than last_committed for the import and do not
+ * have rq_replay set.
+ * Since requests are sorted in transno order, it stops at the first transno
+ * bigger than last_committed.
+ * Caller must hold imp->imp_lock.
+ */
+void ptlrpc_free_committed(struct obd_import *imp)
+{
+ struct ptlrpc_request *req, *saved;
+ struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+ bool skip_committed_list = true;
+
+ LASSERT(imp != NULL);
+ assert_spin_locked(&imp->imp_lock);
+
+ if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+ imp->imp_generation == imp->imp_last_generation_checked) {
+ CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n",
+ imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+ return;
+ }
+ CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n",
+ imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+ imp->imp_generation);
+
+ if (imp->imp_generation != imp->imp_last_generation_checked)
+ skip_committed_list = false;
+
+ imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+ imp->imp_last_generation_checked = imp->imp_generation;
+
+ list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
+ rq_replay_list) {
+ /* XXX ok to remove when 1357 resolved - rread 05/29/03 */
+ LASSERT(req != last_req);
+ last_req = req;
+
+ if (req->rq_transno == 0) {
+ DEBUG_REQ(D_EMERG, req, "zero transno during replay");
+ LBUG();
+ }
+ if (req->rq_import_generation < imp->imp_generation) {
+ DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
+ goto free_req;
+ }
+
+ /* not yet committed */
+ if (req->rq_transno > imp->imp_peer_committed_transno) {
+ DEBUG_REQ(D_RPCTRACE, req, "stopping search");
+ break;
+ }
+
+ if (req->rq_replay) {
+ DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+ list_move_tail(&req->rq_replay_list,
+ &imp->imp_committed_list);
+ continue;
+ }
+
+ DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)",
+ imp->imp_peer_committed_transno);
+free_req:
+ ptlrpc_free_request(req);
+ }
+ if (skip_committed_list)
+ return;
+
+ list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
+ rq_replay_list) {
+ LASSERT(req->rq_transno != 0);
+ if (req->rq_import_generation < imp->imp_generation) {
+ DEBUG_REQ(D_RPCTRACE, req, "free stale open request");
+ ptlrpc_free_request(req);
+ }
+ }
+}
+
+void ptlrpc_cleanup_client(struct obd_import *imp)
+{
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_client);
+
+/**
+ * Schedule previously sent request for resend.
+ * For bulk requests we assign a new xid (to avoid problems with lost
+ * replies causing several transfers from different sending attempts to
+ * land in the same buffer).
+ */
+void ptlrpc_resend_req(struct ptlrpc_request *req)
+{
+ DEBUG_REQ(D_HA, req, "going to resend");
+ spin_lock(&req->rq_lock);
+
+ /* Request got a reply but is still linked to the import list.
+ Let ptlrpc_check_set() process it. */
+ if (ptlrpc_client_replied(req)) {
+ spin_unlock(&req->rq_lock);
+ DEBUG_REQ(D_HA, req, "it has reply, so skip it");
+ return;
+ }
+
+ lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
+ req->rq_status = -EAGAIN;
+
+ req->rq_resend = 1;
+ req->rq_net_err = 0;
+ req->rq_timedout = 0;
+ if (req->rq_bulk) {
+ __u64 old_xid = req->rq_xid;
+
+ /* ensure previous bulk fails */
+ req->rq_xid = ptlrpc_next_xid();
+ CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
+ old_xid, req->rq_xid);
+ }
+ ptlrpc_client_wake_req(req);
+ spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_resend_req);
+
+/* XXX: this function and rq_status are currently unused */
+void ptlrpc_restart_req(struct ptlrpc_request *req)
+{
+ DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
+ req->rq_status = -ERESTARTSYS;
+
+ spin_lock(&req->rq_lock);
+ req->rq_restart = 1;
+ req->rq_timedout = 0;
+ ptlrpc_client_wake_req(req);
+ spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_restart_req);
+
+/**
+ * Grab an additional reference on request \a req
+ */
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
+{
+ atomic_inc(&req->rq_refcount);
+ return req;
+}
+EXPORT_SYMBOL(ptlrpc_request_addref);
+
+/**
+ * Add a request to import replay_list.
+ * Must be called under imp_lock
+ */
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+ struct obd_import *imp)
+{
+ struct list_head *tmp;
+
+ assert_spin_locked(&imp->imp_lock);
+
+ if (req->rq_transno == 0) {
+ DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
+ LBUG();
+ }
+
+ /* clear this for new requests that were resent as well
+ as resent replayed requests. */
+ lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
+ /* don't re-add requests that have been replayed */
+ if (!list_empty(&req->rq_replay_list))
+ return;
+
+ lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
+
+ LASSERT(imp->imp_replayable);
+ /* Balanced in ptlrpc_free_committed, usually. */
+ ptlrpc_request_addref(req);
+ list_for_each_prev(tmp, &imp->imp_replay_list) {
+ struct ptlrpc_request *iter =
+ list_entry(tmp, struct ptlrpc_request,
+ rq_replay_list);
+
+ /* We may have duplicate transnos if we create and then
+ * open a file, or for closes retained to match creating
+ * opens, so use req->rq_xid as a secondary key.
+ * (See bugs 684, 685, and 428.)
+ * XXX no longer needed, but all opens need transnos!
+ */
+ if (iter->rq_transno > req->rq_transno)
+ continue;
+
+ if (iter->rq_transno == req->rq_transno) {
+ LASSERT(iter->rq_xid != req->rq_xid);
+ if (iter->rq_xid > req->rq_xid)
+ continue;
+ }
+
+ list_add(&req->rq_replay_list, &iter->rq_replay_list);
+ return;
+ }
+
+ list_add(&req->rq_replay_list, &imp->imp_replay_list);
+}
+EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
+
+/**
+ * Send request and wait until it completes.
+ * Returns request processing status.
+ */
+int ptlrpc_queue_wait(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request_set *set;
+ int rc;
+
+ LASSERT(req->rq_set == NULL);
+ LASSERT(!req->rq_receiving_reply);
+
+ set = ptlrpc_prep_set();
+ if (set == NULL) {
+ CERROR("Unable to allocate ptlrpc set.");
+ return -ENOMEM;
+ }
+
+ /* for distributed debugging */
+ lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+ /* add a ref for the set (see comment in ptlrpc_set_add_req) */
+ ptlrpc_request_addref(req);
+ ptlrpc_set_add_req(set, req);
+ rc = ptlrpc_set_wait(set);
+ ptlrpc_set_destroy(set);
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+
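+/*
+ * Async args used to stash the original send state and reply status of a
+ * request across replay, for use by ptlrpc_replay_interpret().
+ */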
+struct ptlrpc_replay_async_args {
+ int praa_old_state;
+ int praa_old_status;
+};
+
+/**
+ * Callback used for reply processing of replayed requests.
+ * In case of a successful reply, calls the registered request replay
+ * callback. In case of error, restarts the replay process.
+ */
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *data, int rc)
+{
+ struct ptlrpc_replay_async_args *aa = data;
+ struct obd_import *imp = req->rq_import;
+
+ atomic_dec(&imp->imp_replay_inflight);
+
+ if (!ptlrpc_client_replied(req)) {
+ CERROR("request replay timed out, restarting recovery\n");
+ rc = -ETIMEDOUT;
+ goto out;
+ }
+
+ if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
+ (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
+ lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) {
+ rc = lustre_msg_get_status(req->rq_repmsg);
+ goto out;
+ }
+
+ /** VBR: check version failure */
+ if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+ /** replay failed due to version mismatch */
+ DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
+ spin_lock(&imp->imp_lock);
+ imp->imp_vbr_failed = 1;
+ imp->imp_no_lock_replay = 1;
+ spin_unlock(&imp->imp_lock);
+ lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+ } else {
+ /** The transno had better not change over replay. */
+ LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
+ lustre_msg_get_transno(req->rq_repmsg) ||
+ lustre_msg_get_transno(req->rq_repmsg) == 0,
+ "%#llx/%#llx\n",
+ lustre_msg_get_transno(req->rq_reqmsg),
+ lustre_msg_get_transno(req->rq_repmsg));
+ }
+
+ spin_lock(&imp->imp_lock);
+ /** if replaying by version, a gap occurred on the server, so don't trust locks */
+ if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
+ imp->imp_no_lock_replay = 1;
+ imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
+ spin_unlock(&imp->imp_lock);
+ LASSERT(imp->imp_last_replay_transno);
+
+ /* transaction number shouldn't be bigger than the latest replayed */
+ if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
+ DEBUG_REQ(D_ERROR, req,
+ "Reported transno %llu is bigger than the replayed one: %llu",
+ req->rq_transno,
+ lustre_msg_get_transno(req->rq_reqmsg));
+ rc = -EINVAL;
+ goto out;
+ }
+
+ DEBUG_REQ(D_HA, req, "got rep");
+
+ /* let the callback do fixups, possibly including in the request */
+ if (req->rq_replay_cb)
+ req->rq_replay_cb(req);
+
+ if (ptlrpc_client_replied(req) &&
+ lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
+ DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
+ lustre_msg_get_status(req->rq_repmsg),
+ aa->praa_old_status);
+ } else {
+ /* Put it back for re-replay. */
+ lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+ }
+
+ /*
+ * Errors during replay can set transno to 0, but
+ * imp_last_replay_transno shouldn't be set to 0 anyway
+ */
+ if (req->rq_transno == 0)
+ CERROR("Transno is 0 during replay!\n");
+
+ /* continue with recovery */
+ rc = ptlrpc_import_recovery_state_machine(imp);
+ out:
+ req->rq_send_state = aa->praa_old_state;
+
+ if (rc != 0)
+ /* this replay failed, so restart recovery */
+ ptlrpc_connect_import(imp);
+
+ return rc;
+}
+
+/**
+ * Prepares and queues request for replay.
+ * Adds it to ptlrpcd queue for actual sending.
+ * Returns 0 on success.
+ */
+int ptlrpc_replay_req(struct ptlrpc_request *req)
+{
+ struct ptlrpc_replay_async_args *aa;
+
+ LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
+
+ LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ memset(aa, 0, sizeof(*aa));
+
+ /* Prepare request to be resent with ptlrpcd */
+ aa->praa_old_state = req->rq_send_state;
+ req->rq_send_state = LUSTRE_IMP_REPLAY;
+ req->rq_phase = RQ_PHASE_NEW;
+ req->rq_next_phase = RQ_PHASE_UNDEFINED;
+ if (req->rq_repmsg)
+ aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+ req->rq_status = 0;
+ req->rq_interpret_reply = ptlrpc_replay_interpret;
+ /* Readjust the timeout for current conditions */
+ ptlrpc_at_set_req_timeout(req);
+
+ /* Tell server the net_latency, so the server can calculate how long
+ * it should wait for next replay */
+ lustre_msg_set_service_time(req->rq_reqmsg,
+ ptlrpc_at_get_net_latency(req));
+ DEBUG_REQ(D_HA, req, "REPLAY");
+
+ atomic_inc(&req->rq_import->imp_replay_inflight);
+ ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
+
+ ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_replay_req);
+
+/**
+ * Aborts all in-flight requests on the sending and delayed lists of import \a imp.
+ */
+void ptlrpc_abort_inflight(struct obd_import *imp)
+{
+ struct list_head *tmp, *n;
+
+ /* Make sure that no new requests get processed for this import.
+ * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
+ * this flag and then putting requests on sending_list or delayed_list.
+ */
+ spin_lock(&imp->imp_lock);
+
+ /* XXX locking? Maybe we should remove each request with the list
+ * locked? Also, how do we know if the requests on the list are
+ * being freed at this time?
+ */
+ list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request, rq_list);
+
+ DEBUG_REQ(D_RPCTRACE, req, "inflight");
+
+ spin_lock(&req->rq_lock);
+ if (req->rq_import_generation < imp->imp_generation) {
+ req->rq_err = 1;
+ req->rq_status = -EIO;
+ ptlrpc_client_wake_req(req);
+ }
+ spin_unlock(&req->rq_lock);
+ }
+
+ list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request, rq_list);
+
+ DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
+
+ spin_lock(&req->rq_lock);
+ if (req->rq_import_generation < imp->imp_generation) {
+ req->rq_err = 1;
+ req->rq_status = -EIO;
+ ptlrpc_client_wake_req(req);
+ }
+ spin_unlock(&req->rq_lock);
+ }
+
+ /* Last chance to free reqs left on the replay list, but we
+ * will still leak reqs that haven't committed. */
+ if (imp->imp_replayable)
+ ptlrpc_free_committed(imp);
+
+ spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
+
+/**
+ * Abort all uncompleted requests in request set \a set
+ */
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp, *pos;
+
+ LASSERT(set != NULL);
+
+ list_for_each_safe(pos, tmp, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(pos, struct ptlrpc_request,
+ rq_set_chain);
+
+ spin_lock(&req->rq_lock);
+ if (req->rq_phase != RQ_PHASE_RPC) {
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+
+ req->rq_err = 1;
+ req->rq_status = -EINTR;
+ ptlrpc_client_wake_req(req);
+ spin_unlock(&req->rq_lock);
+ }
+}
+
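+/*
+ * Last XID handed out on this node; protected by ptlrpc_last_xid_lock and
+ * advanced in steps of PTLRPC_BULK_OPS_COUNT by ptlrpc_next_xid().
+ */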
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/**
+ * Initialize the XID for the node. This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing. It does not need to be sequential. Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would deliver old data into the wrong RDMA buffer) initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
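+/* 2^30 seconds after the epoch falls in early 2004; a current time below
+ * this is treated as a bogus clock and a random XID seed is used instead. */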
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+ time_t now = get_seconds();
+
+ spin_lock_init(&ptlrpc_last_xid_lock);
+ if (now < YEAR_2004) {
+ cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+ ptlrpc_last_xid >>= 2;
+ ptlrpc_last_xid |= (1ULL << 61);
+ } else {
+ ptlrpc_last_xid = (__u64)now << 20;
+ }
+
+ /* Always need to be aligned to a power-of-two for multi-bulk BRW */
+ CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
+ ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
+}
+
+/**
+ * Increase the xid and return the resulting new value to the caller.
+ *
+ * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
+ * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
+ * itself uses the last bulk xid needed, so the server can determine the
+ * number of bulk transfers from the RPC XID and a bitmask. The starting
+ * xid must align to a power-of-two value.
+ *
+ * This is assumed to be true due to the initial ptlrpc_last_xid
+ * value also being initialized to a power-of-two value. LU-1431
+ */
+__u64 ptlrpc_next_xid(void)
+{
+ __u64 next;
+
+ spin_lock(&ptlrpc_last_xid_lock);
+ next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+ ptlrpc_last_xid = next;
+ spin_unlock(&ptlrpc_last_xid_lock);
+
+ return next;
+}
+EXPORT_SYMBOL(ptlrpc_next_xid);
+
+/**
+ * Get a glimpse at what the next xid value might be.
+ * Returns the possible next xid.
+ */
+__u64 ptlrpc_sample_next_xid(void)
+{
+#if BITS_PER_LONG == 32
+ /* need to avoid possible word tearing on 32-bit systems */
+ __u64 next;
+
+ spin_lock(&ptlrpc_last_xid_lock);
+ next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+ spin_unlock(&ptlrpc_last_xid_lock);
+
+ return next;
+#else
+ /* No need to lock, since the returned value is racy anyway */
+ return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+#endif
+}
+EXPORT_SYMBOL(ptlrpc_sample_next_xid);
+
+/**
+ * Functions for operating ptlrpc workers.
+ *
+ * A ptlrpc work is a function which runs inside ptlrpc context.
+ * The callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
+ *
+ * 1. After a work is created, it can be used many times, that is:
+ * handler = ptlrpcd_alloc_work();
+ * ptlrpcd_queue_work();
+ *
+ * queue it again when necessary:
+ * ptlrpcd_queue_work();
+ * ptlrpcd_destroy_work();
+ * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
+ * but the work will only be queued once at any time. Also, as its name
+ * implies, there may be a delay before it actually runs in a ptlrpcd thread.
+ */
+struct ptlrpc_work_async_args {
+ int (*cb)(const struct lu_env *, void *);
+ void *cbdata;
+};
+
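+/*
+ * Re-arm a work request and hand it back to ptlrpcd: refresh its timeouts
+ * and xid and put it directly into the INTERPRET phase, so ptlrpcd calls
+ * work_interpreter() without any network round-trip.
+ */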
+static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
+{
+ /* re-initialize the req */
+ req->rq_timeout = obd_timeout;
+ req->rq_sent = get_seconds();
+ req->rq_deadline = req->rq_sent + req->rq_timeout;
+ req->rq_reply_deadline = req->rq_deadline;
+ req->rq_phase = RQ_PHASE_INTERPRET;
+ req->rq_next_phase = RQ_PHASE_COMPLETE;
+ req->rq_xid = ptlrpc_next_xid();
+ req->rq_import_generation = req->rq_import->imp_generation;
+
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+}
+
+static int work_interpreter(const struct lu_env *env,
+ struct ptlrpc_request *req, void *data, int rc)
+{
+ struct ptlrpc_work_async_args *arg = data;
+
+ LASSERT(ptlrpcd_check_work(req));
+ LASSERT(arg->cb != NULL);
+
+ rc = arg->cb(env, arg->cbdata);
+
+ list_del_init(&req->rq_set_chain);
+ req->rq_set = NULL;
+
+ if (atomic_dec_return(&req->rq_refcount) > 1) {
+ atomic_set(&req->rq_refcount, 2);
+ ptlrpcd_add_work_req(req);
+ }
+ return rc;
+}
+
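+/*
+ * Dummy format whose address is stored in rq_pill.rc_fmt by
+ * ptlrpcd_alloc_work(), letting ptlrpcd_check_work() recognize work
+ * requests without a real request format.
+ */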
+static int worker_format;
+
+static int ptlrpcd_check_work(struct ptlrpc_request *req)
+{
+ return req->rq_pill.rc_fmt == (void *)&worker_format;
+}
+
+/**
+ * Create a work for ptlrpc.
+ */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+ int (*cb)(const struct lu_env *, void *), void *cbdata)
+{
+ struct ptlrpc_request *req = NULL;
+ struct ptlrpc_work_async_args *args;
+
+ might_sleep();
+
+ if (cb == NULL)
+ return ERR_PTR(-EINVAL);
+
+ /* copy some code from deprecated fakereq. */
+ req = ptlrpc_request_cache_alloc(GFP_NOFS);
+ if (req == NULL) {
+ CERROR("ptlrpc: run out of memory!\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ req->rq_send_state = LUSTRE_IMP_FULL;
+ req->rq_type = PTL_RPC_MSG_REQUEST;
+ req->rq_import = class_import_get(imp);
+ req->rq_export = NULL;
+ req->rq_interpret_reply = work_interpreter;
+ /* don't want reply */
+ req->rq_receiving_reply = 0;
+ req->rq_req_unlink = req->rq_reply_unlink = 0;
+ req->rq_no_delay = req->rq_no_resend = 1;
+ req->rq_pill.rc_fmt = (void *)&worker_format;
+
+ spin_lock_init(&req->rq_lock);
+ INIT_LIST_HEAD(&req->rq_list);
+ INIT_LIST_HEAD(&req->rq_replay_list);
+ INIT_LIST_HEAD(&req->rq_set_chain);
+ INIT_LIST_HEAD(&req->rq_history_list);
+ INIT_LIST_HEAD(&req->rq_exp_list);
+ init_waitqueue_head(&req->rq_reply_waitq);
+ init_waitqueue_head(&req->rq_set_waitq);
+ atomic_set(&req->rq_refcount, 1);
+
+ CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
+ args = ptlrpc_req_async_args(req);
+ args->cb = cb;
+ args->cbdata = cbdata;
+
+ return req;
+}
+EXPORT_SYMBOL(ptlrpcd_alloc_work);
+
+void ptlrpcd_destroy_work(void *handler)
+{
+ struct ptlrpc_request *req = handler;
+
+ if (req)
+ ptlrpc_req_finished(req);
+}
+EXPORT_SYMBOL(ptlrpcd_destroy_work);
+
+int ptlrpcd_queue_work(void *handler)
+{
+ struct ptlrpc_request *req = handler;
+
+ /*
+ * Check if the req is already being queued.
+ *
+ * Here comes a trick: ptlrpc lacks a way of reliably checking whether a
+ * req is being processed, so the refcount of the req is used for this
+ * purpose. This is okay because the caller should treat this req as
+ * opaque data. - Jinshan
+ */
+ LASSERT(atomic_read(&req->rq_refcount) > 0);
+ if (atomic_inc_return(&req->rq_refcount) == 2)
+ ptlrpcd_add_work_req(req);
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpcd_queue_work);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c
new file mode 100644
index 000000000..7e27397ce
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/connection.c
@@ -0,0 +1,241 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+
+#include "ptlrpc_internal.h"
+
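+/*
+ * Global peer -> ptlrpc_connection cache; entries stay cached until module
+ * unload (see ptlrpc_connection_put() and conn_exit()).
+ */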
+static struct cfs_hash *conn_hash;
+static cfs_hash_ops_t conn_hash_ops;
+
+struct ptlrpc_connection *
+ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
+ struct obd_uuid *uuid)
+{
+ struct ptlrpc_connection *conn, *conn2;
+
+ conn = cfs_hash_lookup(conn_hash, &peer);
+ if (conn)
+ goto out;
+
+ OBD_ALLOC_PTR(conn);
+ if (!conn)
+ return NULL;
+
+ conn->c_peer = peer;
+ conn->c_self = self;
+ INIT_HLIST_NODE(&conn->c_hash);
+ atomic_set(&conn->c_refcount, 1);
+ if (uuid)
+ obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+
+ /*
+ * Add the newly created conn to the hash; on key collision we
+ * lost a racing addition and must destroy our newly allocated
+ * connection. The object which exists in the hash will be
+ * returned and may be compared against our object.
+ */
+ /* In the function below, .hs_keycmp resolves to
+ * conn_keycmp() */
+ /* coverity[overrun-buffer-val] */
+ conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
+ if (conn != conn2) {
+ OBD_FREE_PTR(conn);
+ conn = conn2;
+ }
+out:
+ CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+ conn, atomic_read(&conn->c_refcount),
+ libcfs_nid2str(conn->c_peer.nid));
+ return conn;
+}
+EXPORT_SYMBOL(ptlrpc_connection_get);
+
+int ptlrpc_connection_put(struct ptlrpc_connection *conn)
+{
+ int rc = 0;
+
+ if (!conn)
+ return rc;
+
+ LASSERT(atomic_read(&conn->c_refcount) > 1);
+
+ /*
+ * We do not remove the connection from the hashtable and
+ * do not free it even if the last caller released its ref,
+ * as we want to keep it cached in case it is
+ * needed again.
+ *
+ * Deallocating it and later creating a new connection
+ * again would be wasteful. This way we also avoid
+ * expensive locking to protect against a get/put
+ * race when a found cached connection is freed by
+ * ptlrpc_connection_put().
+ *
+ * It will be freed later at module unload time,
+ * when the ptlrpc_connection_fini()->lh_exit->conn_exit()
+ * path is called.
+ */
+ if (atomic_dec_return(&conn->c_refcount) == 1)
+ rc = 1;
+
+ CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n",
+ conn, atomic_read(&conn->c_refcount),
+ libcfs_nid2str(conn->c_peer.nid));
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_connection_put);
+
+struct ptlrpc_connection *
+ptlrpc_connection_addref(struct ptlrpc_connection *conn)
+{
+ atomic_inc(&conn->c_refcount);
+ CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+ conn, atomic_read(&conn->c_refcount),
+ libcfs_nid2str(conn->c_peer.nid));
+
+ return conn;
+}
+EXPORT_SYMBOL(ptlrpc_connection_addref);
+
+int ptlrpc_connection_init(void)
+{
+ conn_hash = cfs_hash_create("CONN_HASH",
+ HASH_CONN_CUR_BITS,
+ HASH_CONN_MAX_BITS,
+ HASH_CONN_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &conn_hash_ops, CFS_HASH_DEFAULT);
+ if (!conn_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_connection_init);
+
+void ptlrpc_connection_fini(void)
+{
+ cfs_hash_putref(conn_hash);
+}
+EXPORT_SYMBOL(ptlrpc_connection_fini);
+
+/*
+ * Hash operations for net_peer<->connection
+ */
+static unsigned
+conn_hashfn(struct cfs_hash *hs, const void *key, unsigned mask)
+{
+ return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask);
+}
+
+static int
+conn_keycmp(const void *key, struct hlist_node *hnode)
+{
+ struct ptlrpc_connection *conn;
+ const lnet_process_id_t *conn_key;
+
+ LASSERT(key != NULL);
+ conn_key = (lnet_process_id_t *)key;
+ conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+ return conn_key->nid == conn->c_peer.nid &&
+ conn_key->pid == conn->c_peer.pid;
+}
+
+static void *
+conn_key(struct hlist_node *hnode)
+{
+ struct ptlrpc_connection *conn;
+
+ conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+ return &conn->c_peer;
+}
+
+static void *
+conn_object(struct hlist_node *hnode)
+{
+ return hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+}
+
+static void
+conn_get(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ptlrpc_connection *conn;
+
+ conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+ atomic_inc(&conn->c_refcount);
+}
+
+static void
+conn_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ptlrpc_connection *conn;
+
+ conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+ atomic_dec(&conn->c_refcount);
+}
+
+static void
+conn_exit(struct cfs_hash *hs, struct hlist_node *hnode)
+{
+ struct ptlrpc_connection *conn;
+
+ conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+ /*
+ * Nothing should be left. The connection user has put it and
+ * the connection has also been deleted from the table by this
+ * time, so we should have 0 refs.
+ */
+ LASSERTF(atomic_read(&conn->c_refcount) == 0,
+ "Busy connection with %d refs\n",
+ atomic_read(&conn->c_refcount));
+ OBD_FREE_PTR(conn);
+}
+
+static cfs_hash_ops_t conn_hash_ops = {
+ .hs_hash = conn_hashfn,
+ .hs_keycmp = conn_keycmp,
+ .hs_key = conn_key,
+ .hs_object = conn_object,
+ .hs_get = conn_get,
+ .hs_put_locked = conn_put_locked,
+ .hs_exit = conn_exit,
+};
diff --git a/drivers/staging/lustre/lustre/ptlrpc/errno.c b/drivers/staging/lustre/lustre/ptlrpc/errno.c
new file mode 100644
index 000000000..73f8374f1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/errno.c
@@ -0,0 +1,380 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include "../include/lustre/lustre_errno.h"
+
+/*
+ * The two translation tables below must define a one-to-one mapping between
+ * host and network errnos.
+ *
+ * EWOULDBLOCK is equal to EAGAIN on all architectures except for parisc, which
+ * appears irrelevant. Thus, existing references to EWOULDBLOCK are fine.
+ *
+ * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc
+ * host has no context-free way to determine if a LUSTRE_EDEADLK represents an
+ * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK
+ * that need to be transferred on wire have been replaced with EDEADLK.
+ */
+static int lustre_errno_hton_mapping[] = {
+ [EPERM] = LUSTRE_EPERM,
+ [ENOENT] = LUSTRE_ENOENT,
+ [ESRCH] = LUSTRE_ESRCH,
+ [EINTR] = LUSTRE_EINTR,
+ [EIO] = LUSTRE_EIO,
+ [ENXIO] = LUSTRE_ENXIO,
+ [E2BIG] = LUSTRE_E2BIG,
+ [ENOEXEC] = LUSTRE_ENOEXEC,
+ [EBADF] = LUSTRE_EBADF,
+ [ECHILD] = LUSTRE_ECHILD,
+ [EAGAIN] = LUSTRE_EAGAIN,
+ [ENOMEM] = LUSTRE_ENOMEM,
+ [EACCES] = LUSTRE_EACCES,
+ [EFAULT] = LUSTRE_EFAULT,
+ [ENOTBLK] = LUSTRE_ENOTBLK,
+ [EBUSY] = LUSTRE_EBUSY,
+ [EEXIST] = LUSTRE_EEXIST,
+ [EXDEV] = LUSTRE_EXDEV,
+ [ENODEV] = LUSTRE_ENODEV,
+ [ENOTDIR] = LUSTRE_ENOTDIR,
+ [EISDIR] = LUSTRE_EISDIR,
+ [EINVAL] = LUSTRE_EINVAL,
+ [ENFILE] = LUSTRE_ENFILE,
+ [EMFILE] = LUSTRE_EMFILE,
+ [ENOTTY] = LUSTRE_ENOTTY,
+ [ETXTBSY] = LUSTRE_ETXTBSY,
+ [EFBIG] = LUSTRE_EFBIG,
+ [ENOSPC] = LUSTRE_ENOSPC,
+ [ESPIPE] = LUSTRE_ESPIPE,
+ [EROFS] = LUSTRE_EROFS,
+ [EMLINK] = LUSTRE_EMLINK,
+ [EPIPE] = LUSTRE_EPIPE,
+ [EDOM] = LUSTRE_EDOM,
+ [ERANGE] = LUSTRE_ERANGE,
+ [EDEADLK] = LUSTRE_EDEADLK,
+ [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG,
+ [ENOLCK] = LUSTRE_ENOLCK,
+ [ENOSYS] = LUSTRE_ENOSYS,
+ [ENOTEMPTY] = LUSTRE_ENOTEMPTY,
+ [ELOOP] = LUSTRE_ELOOP,
+ [ENOMSG] = LUSTRE_ENOMSG,
+ [EIDRM] = LUSTRE_EIDRM,
+ [ECHRNG] = LUSTRE_ECHRNG,
+ [EL2NSYNC] = LUSTRE_EL2NSYNC,
+ [EL3HLT] = LUSTRE_EL3HLT,
+ [EL3RST] = LUSTRE_EL3RST,
+ [ELNRNG] = LUSTRE_ELNRNG,
+ [EUNATCH] = LUSTRE_EUNATCH,
+ [ENOCSI] = LUSTRE_ENOCSI,
+ [EL2HLT] = LUSTRE_EL2HLT,
+ [EBADE] = LUSTRE_EBADE,
+ [EBADR] = LUSTRE_EBADR,
+ [EXFULL] = LUSTRE_EXFULL,
+ [ENOANO] = LUSTRE_ENOANO,
+ [EBADRQC] = LUSTRE_EBADRQC,
+ [EBADSLT] = LUSTRE_EBADSLT,
+ [EBFONT] = LUSTRE_EBFONT,
+ [ENOSTR] = LUSTRE_ENOSTR,
+ [ENODATA] = LUSTRE_ENODATA,
+ [ETIME] = LUSTRE_ETIME,
+ [ENOSR] = LUSTRE_ENOSR,
+ [ENONET] = LUSTRE_ENONET,
+ [ENOPKG] = LUSTRE_ENOPKG,
+ [EREMOTE] = LUSTRE_EREMOTE,
+ [ENOLINK] = LUSTRE_ENOLINK,
+ [EADV] = LUSTRE_EADV,
+ [ESRMNT] = LUSTRE_ESRMNT,
+ [ECOMM] = LUSTRE_ECOMM,
+ [EPROTO] = LUSTRE_EPROTO,
+ [EMULTIHOP] = LUSTRE_EMULTIHOP,
+ [EDOTDOT] = LUSTRE_EDOTDOT,
+ [EBADMSG] = LUSTRE_EBADMSG,
+ [EOVERFLOW] = LUSTRE_EOVERFLOW,
+ [ENOTUNIQ] = LUSTRE_ENOTUNIQ,
+ [EBADFD] = LUSTRE_EBADFD,
+ [EREMCHG] = LUSTRE_EREMCHG,
+ [ELIBACC] = LUSTRE_ELIBACC,
+ [ELIBBAD] = LUSTRE_ELIBBAD,
+ [ELIBSCN] = LUSTRE_ELIBSCN,
+ [ELIBMAX] = LUSTRE_ELIBMAX,
+ [ELIBEXEC] = LUSTRE_ELIBEXEC,
+ [EILSEQ] = LUSTRE_EILSEQ,
+ [ERESTART] = LUSTRE_ERESTART,
+ [ESTRPIPE] = LUSTRE_ESTRPIPE,
+ [EUSERS] = LUSTRE_EUSERS,
+ [ENOTSOCK] = LUSTRE_ENOTSOCK,
+ [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ,
+ [EMSGSIZE] = LUSTRE_EMSGSIZE,
+ [EPROTOTYPE] = LUSTRE_EPROTOTYPE,
+ [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT,
+ [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT,
+ [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT,
+ [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP,
+ [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT,
+ [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT,
+ [EADDRINUSE] = LUSTRE_EADDRINUSE,
+ [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL,
+ [ENETDOWN] = LUSTRE_ENETDOWN,
+ [ENETUNREACH] = LUSTRE_ENETUNREACH,
+ [ENETRESET] = LUSTRE_ENETRESET,
+ [ECONNABORTED] = LUSTRE_ECONNABORTED,
+ [ECONNRESET] = LUSTRE_ECONNRESET,
+ [ENOBUFS] = LUSTRE_ENOBUFS,
+ [EISCONN] = LUSTRE_EISCONN,
+ [ENOTCONN] = LUSTRE_ENOTCONN,
+ [ESHUTDOWN] = LUSTRE_ESHUTDOWN,
+ [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS,
+ [ETIMEDOUT] = LUSTRE_ETIMEDOUT,
+ [ECONNREFUSED] = LUSTRE_ECONNREFUSED,
+ [EHOSTDOWN] = LUSTRE_EHOSTDOWN,
+ [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH,
+ [EALREADY] = LUSTRE_EALREADY,
+ [EINPROGRESS] = LUSTRE_EINPROGRESS,
+ [ESTALE] = LUSTRE_ESTALE,
+ [EUCLEAN] = LUSTRE_EUCLEAN,
+ [ENOTNAM] = LUSTRE_ENOTNAM,
+ [ENAVAIL] = LUSTRE_ENAVAIL,
+ [EISNAM] = LUSTRE_EISNAM,
+ [EREMOTEIO] = LUSTRE_EREMOTEIO,
+ [EDQUOT] = LUSTRE_EDQUOT,
+ [ENOMEDIUM] = LUSTRE_ENOMEDIUM,
+ [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE,
+ [ECANCELED] = LUSTRE_ECANCELED,
+ [ENOKEY] = LUSTRE_ENOKEY,
+ [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED,
+ [EKEYREVOKED] = LUSTRE_EKEYREVOKED,
+ [EKEYREJECTED] = LUSTRE_EKEYREJECTED,
+ [EOWNERDEAD] = LUSTRE_EOWNERDEAD,
+ [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE,
+ [ERESTARTSYS] = LUSTRE_ERESTARTSYS,
+ [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR,
+ [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND,
+ [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD,
+ [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK,
+ [EBADHANDLE] = LUSTRE_EBADHANDLE,
+ [ENOTSYNC] = LUSTRE_ENOTSYNC,
+ [EBADCOOKIE] = LUSTRE_EBADCOOKIE,
+ [ENOTSUPP] = LUSTRE_ENOTSUPP,
+ [ETOOSMALL] = LUSTRE_ETOOSMALL,
+ [ESERVERFAULT] = LUSTRE_ESERVERFAULT,
+ [EBADTYPE] = LUSTRE_EBADTYPE,
+ [EJUKEBOX] = LUSTRE_EJUKEBOX,
+ [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED,
+};
+
+static int lustre_errno_ntoh_mapping[] = {
+ [LUSTRE_EPERM] = EPERM,
+ [LUSTRE_ENOENT] = ENOENT,
+ [LUSTRE_ESRCH] = ESRCH,
+ [LUSTRE_EINTR] = EINTR,
+ [LUSTRE_EIO] = EIO,
+ [LUSTRE_ENXIO] = ENXIO,
+ [LUSTRE_E2BIG] = E2BIG,
+ [LUSTRE_ENOEXEC] = ENOEXEC,
+ [LUSTRE_EBADF] = EBADF,
+ [LUSTRE_ECHILD] = ECHILD,
+ [LUSTRE_EAGAIN] = EAGAIN,
+ [LUSTRE_ENOMEM] = ENOMEM,
+ [LUSTRE_EACCES] = EACCES,
+ [LUSTRE_EFAULT] = EFAULT,
+ [LUSTRE_ENOTBLK] = ENOTBLK,
+ [LUSTRE_EBUSY] = EBUSY,
+ [LUSTRE_EEXIST] = EEXIST,
+ [LUSTRE_EXDEV] = EXDEV,
+ [LUSTRE_ENODEV] = ENODEV,
+ [LUSTRE_ENOTDIR] = ENOTDIR,
+ [LUSTRE_EISDIR] = EISDIR,
+ [LUSTRE_EINVAL] = EINVAL,
+ [LUSTRE_ENFILE] = ENFILE,
+ [LUSTRE_EMFILE] = EMFILE,
+ [LUSTRE_ENOTTY] = ENOTTY,
+ [LUSTRE_ETXTBSY] = ETXTBSY,
+ [LUSTRE_EFBIG] = EFBIG,
+ [LUSTRE_ENOSPC] = ENOSPC,
+ [LUSTRE_ESPIPE] = ESPIPE,
+ [LUSTRE_EROFS] = EROFS,
+ [LUSTRE_EMLINK] = EMLINK,
+ [LUSTRE_EPIPE] = EPIPE,
+ [LUSTRE_EDOM] = EDOM,
+ [LUSTRE_ERANGE] = ERANGE,
+ [LUSTRE_EDEADLK] = EDEADLK,
+ [LUSTRE_ENAMETOOLONG] = ENAMETOOLONG,
+ [LUSTRE_ENOLCK] = ENOLCK,
+ [LUSTRE_ENOSYS] = ENOSYS,
+ [LUSTRE_ENOTEMPTY] = ENOTEMPTY,
+ [LUSTRE_ELOOP] = ELOOP,
+ [LUSTRE_ENOMSG] = ENOMSG,
+ [LUSTRE_EIDRM] = EIDRM,
+ [LUSTRE_ECHRNG] = ECHRNG,
+ [LUSTRE_EL2NSYNC] = EL2NSYNC,
+ [LUSTRE_EL3HLT] = EL3HLT,
+ [LUSTRE_EL3RST] = EL3RST,
+ [LUSTRE_ELNRNG] = ELNRNG,
+ [LUSTRE_EUNATCH] = EUNATCH,
+ [LUSTRE_ENOCSI] = ENOCSI,
+ [LUSTRE_EL2HLT] = EL2HLT,
+ [LUSTRE_EBADE] = EBADE,
+ [LUSTRE_EBADR] = EBADR,
+ [LUSTRE_EXFULL] = EXFULL,
+ [LUSTRE_ENOANO] = ENOANO,
+ [LUSTRE_EBADRQC] = EBADRQC,
+ [LUSTRE_EBADSLT] = EBADSLT,
+ [LUSTRE_EBFONT] = EBFONT,
+ [LUSTRE_ENOSTR] = ENOSTR,
+ [LUSTRE_ENODATA] = ENODATA,
+ [LUSTRE_ETIME] = ETIME,
+ [LUSTRE_ENOSR] = ENOSR,
+ [LUSTRE_ENONET] = ENONET,
+ [LUSTRE_ENOPKG] = ENOPKG,
+ [LUSTRE_EREMOTE] = EREMOTE,
+ [LUSTRE_ENOLINK] = ENOLINK,
+ [LUSTRE_EADV] = EADV,
+ [LUSTRE_ESRMNT] = ESRMNT,
+ [LUSTRE_ECOMM] = ECOMM,
+ [LUSTRE_EPROTO] = EPROTO,
+ [LUSTRE_EMULTIHOP] = EMULTIHOP,
+ [LUSTRE_EDOTDOT] = EDOTDOT,
+ [LUSTRE_EBADMSG] = EBADMSG,
+ [LUSTRE_EOVERFLOW] = EOVERFLOW,
+ [LUSTRE_ENOTUNIQ] = ENOTUNIQ,
+ [LUSTRE_EBADFD] = EBADFD,
+ [LUSTRE_EREMCHG] = EREMCHG,
+ [LUSTRE_ELIBACC] = ELIBACC,
+ [LUSTRE_ELIBBAD] = ELIBBAD,
+ [LUSTRE_ELIBSCN] = ELIBSCN,
+ [LUSTRE_ELIBMAX] = ELIBMAX,
+ [LUSTRE_ELIBEXEC] = ELIBEXEC,
+ [LUSTRE_EILSEQ] = EILSEQ,
+ [LUSTRE_ERESTART] = ERESTART,
+ [LUSTRE_ESTRPIPE] = ESTRPIPE,
+ [LUSTRE_EUSERS] = EUSERS,
+ [LUSTRE_ENOTSOCK] = ENOTSOCK,
+ [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ,
+ [LUSTRE_EMSGSIZE] = EMSGSIZE,
+ [LUSTRE_EPROTOTYPE] = EPROTOTYPE,
+ [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT,
+ [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT,
+ [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT,
+ [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP,
+ [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT,
+ [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT,
+ [LUSTRE_EADDRINUSE] = EADDRINUSE,
+ [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL,
+ [LUSTRE_ENETDOWN] = ENETDOWN,
+ [LUSTRE_ENETUNREACH] = ENETUNREACH,
+ [LUSTRE_ENETRESET] = ENETRESET,
+ [LUSTRE_ECONNABORTED] = ECONNABORTED,
+ [LUSTRE_ECONNRESET] = ECONNRESET,
+ [LUSTRE_ENOBUFS] = ENOBUFS,
+ [LUSTRE_EISCONN] = EISCONN,
+ [LUSTRE_ENOTCONN] = ENOTCONN,
+ [LUSTRE_ESHUTDOWN] = ESHUTDOWN,
+ [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS,
+ [LUSTRE_ETIMEDOUT] = ETIMEDOUT,
+ [LUSTRE_ECONNREFUSED] = ECONNREFUSED,
+ [LUSTRE_EHOSTDOWN] = EHOSTDOWN,
+ [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH,
+ [LUSTRE_EALREADY] = EALREADY,
+ [LUSTRE_EINPROGRESS] = EINPROGRESS,
+ [LUSTRE_ESTALE] = ESTALE,
+ [LUSTRE_EUCLEAN] = EUCLEAN,
+ [LUSTRE_ENOTNAM] = ENOTNAM,
+ [LUSTRE_ENAVAIL] = ENAVAIL,
+ [LUSTRE_EISNAM] = EISNAM,
+ [LUSTRE_EREMOTEIO] = EREMOTEIO,
+ [LUSTRE_EDQUOT] = EDQUOT,
+ [LUSTRE_ENOMEDIUM] = ENOMEDIUM,
+ [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE,
+ [LUSTRE_ECANCELED] = ECANCELED,
+ [LUSTRE_ENOKEY] = ENOKEY,
+ [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED,
+ [LUSTRE_EKEYREVOKED] = EKEYREVOKED,
+ [LUSTRE_EKEYREJECTED] = EKEYREJECTED,
+ [LUSTRE_EOWNERDEAD] = EOWNERDEAD,
+ [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE,
+ [LUSTRE_ERESTARTSYS] = ERESTARTSYS,
+ [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR,
+ [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND,
+ [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD,
+ [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK,
+ [LUSTRE_EBADHANDLE] = EBADHANDLE,
+ [LUSTRE_ENOTSYNC] = ENOTSYNC,
+ [LUSTRE_EBADCOOKIE] = EBADCOOKIE,
+ [LUSTRE_ENOTSUPP] = ENOTSUPP,
+ [LUSTRE_ETOOSMALL] = ETOOSMALL,
+ [LUSTRE_ESERVERFAULT] = ESERVERFAULT,
+ [LUSTRE_EBADTYPE] = EBADTYPE,
+ [LUSTRE_EJUKEBOX] = EJUKEBOX,
+ [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED,
+};
+
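+/*
+ * Translate a host errno \a h to its on-wire Lustre value; unmapped or
+ * out-of-range values fall back to the generic LUSTRE_EIO.
+ */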
+unsigned int lustre_errno_hton(unsigned int h)
+{
+ unsigned int n;
+
+ if (h == 0) {
+ n = 0;
+ } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) {
+ n = lustre_errno_hton_mapping[h];
+ if (n == 0)
+ goto generic;
+ } else {
+generic:
+ /*
+ * A generic errno is better than the unknown one that could
+ * mean anything to a different host.
+ */
+ n = LUSTRE_EIO;
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(lustre_errno_hton);
+
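+/*
+ * Translate an on-wire Lustre errno \a n back to the host value; unmapped or
+ * out-of-range values fall back to the generic EIO.
+ */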
+unsigned int lustre_errno_ntoh(unsigned int n)
+{
+ unsigned int h;
+
+ if (n == 0) {
+ h = 0;
+ } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) {
+ h = lustre_errno_ntoh_mapping[n];
+ if (h == 0)
+ goto generic;
+ } else {
+generic:
+ /*
+ * Similar to the situation in lustre_errno_hton(), an unknown
+ * network errno could coincide with anything. Hence, it is
+ * better to return a generic errno.
+ */
+ h = EIO;
+ }
+
+ return h;
+}
+EXPORT_SYMBOL(lustre_errno_ntoh);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c
new file mode 100644
index 000000000..7f8644e01
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/events.c
@@ -0,0 +1,585 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../../include/linux/libcfs/libcfs.h"
+# ifdef __mips64__
+# include <linux/kernel.h>
+# endif
+
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_sec.h"
+#include "ptlrpc_internal.h"
+
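+/* Event queue handle used for all ptlrpc LNet event callbacks on this node */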
+lnet_handle_eq_t ptlrpc_eq_h;
+
+/*
+ * Client's outgoing request callback
+ */
+void request_out_callback(lnet_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+ struct ptlrpc_request *req = cbid->cbid_arg;
+
+ LASSERT(ev->type == LNET_EVENT_SEND ||
+ ev->type == LNET_EVENT_UNLINK);
+ LASSERT(ev->unlinked);
+
+ DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+ sptlrpc_request_out_callback(req);
+ spin_lock(&req->rq_lock);
+ req->rq_real_sent = get_seconds();
+ if (ev->unlinked)
+ req->rq_req_unlink = 0;
+
+ if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) {
+
+ /* Failed send: make it seem like the reply timed out, just
+ * like failing sends in client.c does currently... */
+
+ req->rq_net_err = 1;
+ ptlrpc_client_wake_req(req);
+ }
+ spin_unlock(&req->rq_lock);
+
+ ptlrpc_req_finished(req);
+}
+
+/*
+ * Client's incoming reply callback
+ */
+void reply_in_callback(lnet_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+ struct ptlrpc_request *req = cbid->cbid_arg;
+
+ DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+ LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+ LASSERT(ev->md.start == req->rq_repbuf);
+ LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len);
+ /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+ for adaptive timeouts' early reply. */
+ LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
+
+ spin_lock(&req->rq_lock);
+
+ req->rq_receiving_reply = 0;
+ req->rq_early = 0;
+ if (ev->unlinked)
+ req->rq_reply_unlink = 0;
+
+ if (ev->status)
+ goto out_wake;
+
+ if (ev->type == LNET_EVENT_UNLINK) {
+ LASSERT(ev->unlinked);
+ DEBUG_REQ(D_NET, req, "unlink");
+ goto out_wake;
+ }
+
+ if (ev->mlength < ev->rlength) {
+ CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
+ req->rq_replen, ev->rlength, ev->offset);
+ req->rq_reply_truncate = 1;
+ req->rq_replied = 1;
+ req->rq_status = -EOVERFLOW;
+ req->rq_nob_received = ev->rlength + ev->offset;
+ goto out_wake;
+ }
+
+ if ((ev->offset == 0) &&
+ ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
+ /* Early reply */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply received: mlen=%u offset=%d replen=%d replied=%d unlinked=%d",
+ ev->mlength, ev->offset,
+ req->rq_replen, req->rq_replied, ev->unlinked);
+
+ req->rq_early_count++; /* number received, client side */
+
+ if (req->rq_replied) /* already got the real reply */
+ goto out_wake;
+
+ req->rq_early = 1;
+ req->rq_reply_off = ev->offset;
+ req->rq_nob_received = ev->mlength;
+ /* And we're still receiving */
+ req->rq_receiving_reply = 1;
+ } else {
+ /* Real reply */
+ req->rq_rep_swab_mask = 0;
+ req->rq_replied = 1;
+ /* Got reply, no resend required */
+ req->rq_resend = 0;
+ req->rq_reply_off = ev->offset;
+ req->rq_nob_received = ev->mlength;
+ /* LNetMDUnlink can't be called under the LNET_LOCK,
+ so we must unlink in ptlrpc_unregister_reply */
+ DEBUG_REQ(D_INFO, req,
+ "reply in flags=%x mlen=%u offset=%d replen=%d",
+ lustre_msg_get_flags(req->rq_reqmsg),
+ ev->mlength, ev->offset, req->rq_replen);
+ }
+
+ req->rq_import->imp_last_reply_time = get_seconds();
+
+out_wake:
+ /* NB don't unlock till after wakeup; req can disappear under us
+ * since we don't have our own ref */
+ ptlrpc_client_wake_req(req);
+ spin_unlock(&req->rq_lock);
+}
+
+/*
+ * Client's bulk has been written/read
+ */
+void client_bulk_callback(lnet_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+ struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
+ struct ptlrpc_request *req;
+
+ LASSERT((desc->bd_type == BULK_PUT_SINK &&
+ ev->type == LNET_EVENT_PUT) ||
+ (desc->bd_type == BULK_GET_SOURCE &&
+ ev->type == LNET_EVENT_GET) ||
+ ev->type == LNET_EVENT_UNLINK);
+ LASSERT(ev->unlinked);
+
+ if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+ ev->status = -EIO;
+
+ if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,
+ CFS_FAIL_ONCE))
+ ev->status = -EIO;
+
+ CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+ "event type %d, status %d, desc %p\n",
+ ev->type, ev->status, desc);
+
+ spin_lock(&desc->bd_lock);
+ req = desc->bd_req;
+ LASSERT(desc->bd_md_count > 0);
+ desc->bd_md_count--;
+
+ if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
+ desc->bd_nob_transferred += ev->mlength;
+ desc->bd_sender = ev->sender;
+ } else {
+ /* start reconnect and resend if network error hit */
+ spin_lock(&req->rq_lock);
+ req->rq_net_err = 1;
+ spin_unlock(&req->rq_lock);
+ }
+
+ if (ev->status != 0)
+ desc->bd_failure = 1;
+
+ /* NB don't unlock till after wakeup; desc can disappear under us
+ * otherwise */
+ if (desc->bd_md_count == 0)
+ ptlrpc_client_wake_req(desc->bd_req);
+
+ spin_unlock(&desc->bd_lock);
+}
+
+/*
+ * We will have a per-CPT request history list for each ptlrpc service in
+ * upcoming patches, because we don't want to be serialized by the current
+ * per-service history operations. So we require that the history ID can
+ * (somehow) show arrival order without grabbing a global lock, and that
+ * userspace can sort the entries accordingly.
+ *
+ * This is how we generate history ID for ptlrpc_request:
+ * ----------------------------------------------------
+ * | 32 bits | 16 bits | (16 - X)bits | X bits |
+ * ----------------------------------------------------
+ * | seconds | usec / 16 | sequence | CPT id |
+ * ----------------------------------------------------
+ *
+ * It might not be precise, but it should be good enough.
+ */
+
+#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits)
+
+#define REQS_SEC_SHIFT 32
+#define REQS_USEC_SHIFT 16
+#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt)
+
+static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req)
+{
+ __u64 sec = req->rq_arrival_time.tv_sec;
+ __u32 usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
+ __u64 new_seq;
+
+ /* Set the sequence ID for the request and add it to the history list;
+ * must be called with svcpt::scp_lock held. */
+
+ new_seq = (sec << REQS_SEC_SHIFT) |
+ (usec << REQS_USEC_SHIFT) |
+ (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt);
+
+ if (new_seq > svcpt->scp_hist_seq) {
+ /* This handles the initial case of scp_hist_seq == 0 or
+ * we just jumped into a new time window */
+ svcpt->scp_hist_seq = new_seq;
+ } else {
+ LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
+ /* NB: increase the sequence number within the current usec
+ * bucket; however, it's possible that we used up all the
+ * sequence bits and jumped into the next usec bucket (future
+ * time). In that case we hope there will be fewer RPCs per
+ * bucket at some point, and the sequence will catch up again. */
+ svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
+ new_seq = svcpt->scp_hist_seq;
+ }
+
+ req->rq_history_seq = new_seq;
+
+ list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
+}
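+/*
+ * Illustrative worked example (values are hypothetical, not from the source):
+ * with srv_cpt_bits == 2 and a request arriving on CPT 1 at tv_sec == S,
+ * tv_usec == 40 (40 >> 4 == 2), the first history ID in that bucket is
+ *
+ *   (S << REQS_SEC_SHIFT) | (2 << REQS_USEC_SHIFT) | 1
+ *
+ * and every further request falling into the same usec bucket adds
+ * 1 << REQS_SEQ_SHIFT(svcpt) == 4, so the sequence field grows while the
+ * CPT id bits stay fixed; sorting the 64-bit IDs in userspace therefore
+ * approximately reproduces arrival order, as the comment above notes.
+ */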
+
+/*
+ * Server's incoming request callback
+ */
+void request_in_callback(lnet_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+ struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+ struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+ struct ptlrpc_service *service = svcpt->scp_service;
+ struct ptlrpc_request *req;
+
+ LASSERT(ev->type == LNET_EVENT_PUT ||
+ ev->type == LNET_EVENT_UNLINK);
+ LASSERT((char *)ev->md.start >= rqbd->rqbd_buffer);
+ LASSERT((char *)ev->md.start + ev->offset + ev->mlength <=
+ rqbd->rqbd_buffer + service->srv_buf_size);
+
+ CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+ "event type %d, status %d, service %s\n",
+ ev->type, ev->status, service->srv_name);
+
+ if (ev->unlinked) {
+ /* If this is the last request message to fit in the
+ * request buffer we can use the request object embedded in
+ * rqbd. Note that if we failed to allocate a request,
+ * we'd have to re-post the rqbd, which we can't do in this
+ * context. */
+ req = &rqbd->rqbd_req;
+ memset(req, 0, sizeof(*req));
+ } else {
+ LASSERT(ev->type == LNET_EVENT_PUT);
+ if (ev->status != 0) {
+ /* We moaned above already... */
+ return;
+ }
+ req = ptlrpc_request_cache_alloc(GFP_ATOMIC);
+ if (req == NULL) {
+ CERROR("Can't allocate incoming request descriptor: Dropping %s RPC from %s\n",
+ service->srv_name,
+ libcfs_id2str(ev->initiator));
+ return;
+ }
+ }
+
+ /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+ * flags are reset and scalars are zero. We only set the message
+ * size to non-zero if this was a successful receive. */
+ req->rq_xid = ev->match_bits;
+ req->rq_reqbuf = ev->md.start + ev->offset;
+ if (ev->type == LNET_EVENT_PUT && ev->status == 0)
+ req->rq_reqdata_len = ev->mlength;
+ do_gettimeofday(&req->rq_arrival_time);
+ req->rq_peer = ev->initiator;
+ req->rq_self = ev->target.nid;
+ req->rq_rqbd = rqbd;
+ req->rq_phase = RQ_PHASE_NEW;
+ spin_lock_init(&req->rq_lock);
+ INIT_LIST_HEAD(&req->rq_timed_list);
+ INIT_LIST_HEAD(&req->rq_exp_list);
+ atomic_set(&req->rq_refcount, 1);
+ if (ev->type == LNET_EVENT_PUT)
+ CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n",
+ req, req->rq_xid, ev->mlength);
+
+ CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+
+ spin_lock(&svcpt->scp_lock);
+
+ ptlrpc_req_add_history(svcpt, req);
+
+ if (ev->unlinked) {
+ svcpt->scp_nrqbds_posted--;
+ CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
+ svcpt->scp_nrqbds_posted);
+
+ /* Normally, don't complain about 0 buffers posted; LNET won't
+ * drop incoming reqs since we set the portal lazy */
+ if (test_req_buffer_pressure &&
+ ev->type != LNET_EVENT_UNLINK &&
+ svcpt->scp_nrqbds_posted == 0)
+ CWARN("All %s request buffers busy\n",
+ service->srv_name);
+
+ /* req takes over the network's ref on rqbd */
+ } else {
+ /* req takes a ref on rqbd */
+ rqbd->rqbd_refcount++;
+ }
+
+ list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
+ svcpt->scp_nreqs_incoming++;
+
+ /* NB everything can disappear under us once the request
+ * has been queued and we unlock, so do the wake now... */
+ wake_up(&svcpt->scp_waitq);
+
+ spin_unlock(&svcpt->scp_lock);
+}
+
+/*
+ * Server's outgoing reply callback
+ */
+void reply_out_callback(lnet_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+ struct ptlrpc_reply_state *rs = cbid->cbid_arg;
+ struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+ LASSERT(ev->type == LNET_EVENT_SEND ||
+ ev->type == LNET_EVENT_ACK ||
+ ev->type == LNET_EVENT_UNLINK);
+
+ if (!rs->rs_difficult) {
+ /* 'Easy' replies have no further processing so I drop the
+ * net's ref on 'rs' */
+ LASSERT(ev->unlinked);
+ ptlrpc_rs_decref(rs);
+ return;
+ }
+
+ LASSERT(rs->rs_on_net);
+
+ if (ev->unlinked) {
+ /* Last network callback. The net's ref on 'rs' stays put
+ * until ptlrpc_handle_rs() is done with it */
+ spin_lock(&svcpt->scp_rep_lock);
+ spin_lock(&rs->rs_lock);
+
+ rs->rs_on_net = 0;
+ if (!rs->rs_no_ack ||
+ rs->rs_transno <=
+ rs->rs_export->exp_obd->obd_last_committed)
+ ptlrpc_schedule_difficult_reply(rs);
+
+ spin_unlock(&rs->rs_lock);
+ spin_unlock(&svcpt->scp_rep_lock);
+ }
+}
+
+
+static void ptlrpc_master_callback(lnet_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+ void (*callback)(lnet_event_t *ev) = cbid->cbid_fn;
+
+ /* Honestly, it's best to find out early. */
+ LASSERT(cbid->cbid_arg != LP_POISON);
+ LASSERT(callback == request_out_callback ||
+ callback == reply_in_callback ||
+ callback == client_bulk_callback ||
+ callback == request_in_callback ||
+ callback == reply_out_callback);
+
+ callback(ev);
+}
+
+int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+ lnet_process_id_t *peer, lnet_nid_t *self)
+{
+ int best_dist = 0;
+ __u32 best_order = 0;
+ int count = 0;
+ int rc = -ENOENT;
+ int portals_compatibility;
+ int dist;
+ __u32 order;
+ lnet_nid_t dst_nid;
+ lnet_nid_t src_nid;
+
+ portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);
+
+ peer->pid = LUSTRE_SRV_LNET_PID;
+
+ /* Choose the matching UUID that's closest */
+ while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) {
+ dist = LNetDist(dst_nid, &src_nid, &order);
+ if (dist < 0)
+ continue;
+
+ if (dist == 0) { /* local! use loopback LND */
+ peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
+ rc = 0;
+ break;
+ }
+
+ if (rc < 0 ||
+ dist < best_dist ||
+ (dist == best_dist && order < best_order)) {
+ best_dist = dist;
+ best_order = order;
+
+ if (portals_compatibility > 1) {
+ /* Strong portals compatibility: Zero the nid's
+ * NET, so if I'm reading new config logs, or
+ * getting configured by (new) lconf I can
+ * still talk to old servers. */
+ dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
+ src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
+ }
+ peer->nid = dst_nid;
+ *self = src_nid;
+ rc = 0;
+ }
+ }
+
+ CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+ return rc;
+}
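+/*
+ * Selection sketch (hypothetical numbers, not from the source): if the UUID
+ * resolves to three NIDs with (dist, order) of (2, 1), (1, 3) and (1, 0),
+ * the first candidate is taken initially (rc < 0), then superseded by the
+ * dist-1 candidates; between the two dist-1 NIDs the one with order 0 wins.
+ * A candidate with dist == 0 would short-circuit the loop and select the
+ * loopback LND instead.
+ */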
+
+void ptlrpc_ni_fini(void)
+{
+ wait_queue_head_t waitq;
+ struct l_wait_info lwi;
+ int rc;
+ int retries;
+
+ /* Wait for the event queue to become idle since there may still be
+ * messages in flight with pending events (i.e. the fire-and-forget
+ * messages == client requests and "non-difficult" server
+ * replies). */
+
+ for (retries = 0;; retries++) {
+ rc = LNetEQFree(ptlrpc_eq_h);
+ switch (rc) {
+ default:
+ LBUG();
+
+ case 0:
+ LNetNIFini();
+ return;
+
+ case -EBUSY:
+ if (retries != 0)
+ CWARN("Event queue still busy\n");
+
+ /* Wait for a bit */
+ init_waitqueue_head(&waitq);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
+ l_wait_event(waitq, 0, &lwi);
+ break;
+ }
+ }
+ /* notreached */
+}
+
+lnet_pid_t ptl_get_pid(void)
+{
+ lnet_pid_t pid;
+
+ pid = LUSTRE_SRV_LNET_PID;
+ return pid;
+}
+
+int ptlrpc_ni_init(void)
+{
+ int rc;
+ lnet_pid_t pid;
+
+ pid = ptl_get_pid();
+ CDEBUG(D_NET, "My pid is: %x\n", pid);
+
+ /* We're not passing any limits yet... */
+ rc = LNetNIInit(pid);
+ if (rc < 0) {
+ CDEBUG(D_NET, "Can't init network interface: %d\n", rc);
+ return -ENOENT;
+ }
+
+ /* CAVEAT EMPTOR: how we process portals events is _radically_
+ * different depending on... */
+ /* kernel LNet calls our master callback when there are new events,
+ * because we are guaranteed to get every event via callback,
+ * so we just set EQ size to 0 to avoid overhead of serializing
+ * enqueue/dequeue operations in LNet. */
+ rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
+ if (rc == 0)
+ return 0;
+
+ CERROR("Failed to allocate event queue: %d\n", rc);
+ LNetNIFini();
+
+ return -ENOMEM;
+}
+
+
+int ptlrpc_init_portals(void)
+{
+ int rc = ptlrpc_ni_init();
+
+ if (rc != 0) {
+ CERROR("network initialisation failed\n");
+ return -EIO;
+ }
+ rc = ptlrpcd_addref();
+ if (rc == 0)
+ return 0;
+
+ CERROR("rpcd initialisation failed\n");
+ ptlrpc_ni_fini();
+ return rc;
+}
+
+void ptlrpc_exit_portals(void)
+{
+ ptlrpcd_decref();
+ ptlrpc_ni_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
new file mode 100644
index 000000000..d5fc689c0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c
@@ -0,0 +1,1642 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../include/obd_support.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_export.h"
+#include "../include/obd.h"
+#include "../include/obd_cksum.h"
+#include "../include/obd_class.h"
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpc_connect_async_args {
+ __u64 pcaa_peer_committed;
+ int pcaa_initial_connect;
+};
+
+/**
+ * Update import \a imp's current state to the provided \a state value.
+ * Helper function; must be called under imp_lock.
+ */
+static void __import_set_state(struct obd_import *imp,
+ enum lustre_imp_state state)
+{
+ switch (state) {
+ case LUSTRE_IMP_CLOSED:
+ case LUSTRE_IMP_NEW:
+ case LUSTRE_IMP_DISCON:
+ case LUSTRE_IMP_CONNECTING:
+ break;
+ case LUSTRE_IMP_REPLAY_WAIT:
+ imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS;
+ break;
+ default:
+ imp->imp_replay_state = LUSTRE_IMP_REPLAY;
+ }
+
+ imp->imp_state = state;
+ imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
+ imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
+ get_seconds();
+ imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
+ IMP_STATE_HIST_LEN;
+}
+
+/* A CLOSED import should remain so. */
+#define IMPORT_SET_STATE_NOLOCK(imp, state) \
+do { \
+ if (imp->imp_state != LUSTRE_IMP_CLOSED) { \
+ CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
+ imp, obd2cli_tgt(imp->imp_obd), \
+ ptlrpc_import_state_name(imp->imp_state), \
+ ptlrpc_import_state_name(state)); \
+ __import_set_state(imp, state); \
+ } \
+} while (0)
+
+#define IMPORT_SET_STATE(imp, state) \
+do { \
+ spin_lock(&imp->imp_lock); \
+ IMPORT_SET_STATE_NOLOCK(imp, state); \
+ spin_unlock(&imp->imp_lock); \
+} while (0)
+
+
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+ struct ptlrpc_request *request,
+ void *data, int rc);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+
+/* Only this function is allowed to change the import state when it is
+ * CLOSED. I would rather refcount the import and free it after
+ * disconnection like we do with exports. To do that, the client_obd
+ * will need to save the peer info somewhere other than in the import,
+ * though. */
+int ptlrpc_init_import(struct obd_import *imp)
+{
+ spin_lock(&imp->imp_lock);
+
+ imp->imp_generation++;
+ imp->imp_state = LUSTRE_IMP_NEW;
+
+ spin_unlock(&imp->imp_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_init_import);
+
+#define UUID_STR "_UUID"
+void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
+{
+ *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
+ ? uuid : uuid + strlen(prefix);
+
+ *uuid_len = strlen(*uuid_start);
+
+ if (*uuid_len < strlen(UUID_STR))
+ return;
+
+ if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
+ UUID_STR, strlen(UUID_STR)))
+ *uuid_len -= strlen(UUID_STR);
+}
+EXPORT_SYMBOL(deuuidify);
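+/*
+ * Example (hypothetical target name): for uuid = "lustre-OST0001_UUID" and
+ * prefix = NULL, *uuid_start points at the whole string and *uuid_len is
+ * shortened by strlen("_UUID"), so printing with "%.*s" yields
+ * "lustre-OST0001"; a non-NULL prefix matching the start of uuid is skipped
+ * before the "_UUID" suffix is trimmed.
+ */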
+
+/**
+ * Returns true if import was FULL, false if import was already not
+ * connected.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ * and caused the disconnection. In some cases, multiple
+ * inflight requests can fail to a single target (e.g. OST
+ * bulk requests) and if one has already caused a reconnection
+ * (increasing the import->conn_cnt) the older failure should
+ * not also cause a reconnection. If zero it forces a reconnect.
+ */
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
+{
+ int rc = 0;
+
+ spin_lock(&imp->imp_lock);
+
+ if (imp->imp_state == LUSTRE_IMP_FULL &&
+ (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
+ char *target_start;
+ int target_len;
+
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+ &target_start, &target_len);
+
+ if (imp->imp_replayable) {
+ LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n",
+ imp->imp_obd->obd_name, target_len, target_start,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid));
+ } else {
+ LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n",
+ imp->imp_obd->obd_name,
+ target_len, target_start,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid));
+ }
+ IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+ spin_unlock(&imp->imp_lock);
+
+ if (obd_dump_on_timeout)
+ libcfs_debug_dumplog();
+
+ obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
+ rc = 1;
+ } else {
+ spin_unlock(&imp->imp_lock);
+ CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+ imp->imp_client->cli_name, imp,
+ (imp->imp_state == LUSTRE_IMP_FULL &&
+ imp->imp_conn_cnt > conn_cnt) ?
+ "reconnected" : "not connected", imp->imp_conn_cnt,
+ conn_cnt, ptlrpc_import_state_name(imp->imp_state));
+ }
+
+ return rc;
+}
+
+/* Must be called with imp_lock held! */
+static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp)
+{
+ assert_spin_locked(&imp->imp_lock);
+
+ CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
+ imp->imp_invalid = 1;
+ imp->imp_generation++;
+ spin_unlock(&imp->imp_lock);
+
+ ptlrpc_abort_inflight(imp);
+ obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
+}
+
+/*
+ * This acts as a barrier; all existing requests are rejected, and
+ * no new requests will be accepted until the import is valid again.
+ */
+void ptlrpc_deactivate_import(struct obd_import *imp)
+{
+ spin_lock(&imp->imp_lock);
+ ptlrpc_deactivate_and_unlock_import(imp);
+}
+EXPORT_SYMBOL(ptlrpc_deactivate_import);
+
+static unsigned int
+ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now)
+{
+ long dl;
+
+ if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+ (req->rq_phase == RQ_PHASE_BULK) ||
+ (req->rq_phase == RQ_PHASE_NEW)))
+ return 0;
+
+ if (req->rq_timedout)
+ return 0;
+
+ if (req->rq_phase == RQ_PHASE_NEW)
+ dl = req->rq_sent;
+ else
+ dl = req->rq_deadline;
+
+ if (dl <= now)
+ return 0;
+
+ return dl - now;
+}
+
+static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
+{
+ time_t now = get_seconds();
+ struct list_head *tmp, *n;
+ struct ptlrpc_request *req;
+ unsigned int timeout = 0;
+
+ spin_lock(&imp->imp_lock);
+ list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_list);
+ timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
+ }
+ spin_unlock(&imp->imp_lock);
+ return timeout;
+}
+
+/**
+ * This function will invalidate the import, if necessary, then block
+ * for all the RPC completions, and finally notify the obd to
+ * invalidate its state (ie cancel locks, clear pending requests,
+ * etc).
+ */
+void ptlrpc_invalidate_import(struct obd_import *imp)
+{
+ struct list_head *tmp, *n;
+ struct ptlrpc_request *req;
+ struct l_wait_info lwi;
+ unsigned int timeout;
+ int rc;
+
+ atomic_inc(&imp->imp_inval_count);
+
+ if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
+ ptlrpc_deactivate_import(imp);
+
+ CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
+ LASSERT(imp->imp_invalid);
+
+ /* Wait forever until inflight == 0. We really can't do it another
+ * way because in some cases we need to wait for very long reply
+ * unlink. We can't do anything before that because there is really
+ * no guarantee that some rdma transfer is not in progress right now. */
+ do {
+ /* Calculate the max timeout for waiting on rpcs to error
+ * out. Fall back to obd_timeout if the calculated value
+ * is zero. */
+ if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+ timeout = ptlrpc_inflight_timeout(imp);
+ timeout += timeout / 3;
+
+ if (timeout == 0)
+ timeout = obd_timeout;
+ } else {
+ /* decrease the interval to increase race condition */
+ timeout = 1;
+ }
+
+ CDEBUG(D_RPCTRACE,
+ "Sleeping %d sec for inflight to error out\n",
+ timeout);
+
+ /* Wait for all requests to error out and call completion
+ * callbacks. Cap it at obd_timeout -- these should all
+ * have been locally cancelled by ptlrpc_abort_inflight. */
+ lwi = LWI_TIMEOUT_INTERVAL(
+ cfs_timeout_cap(cfs_time_seconds(timeout)),
+ (timeout > 1) ? cfs_time_seconds(1) : cfs_time_seconds(1) / 2,
+ NULL, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ (atomic_read(&imp->imp_inflight) == 0),
+ &lwi);
+ if (rc) {
+ const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
+
+ CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
+ cli_tgt, rc,
+ atomic_read(&imp->imp_inflight));
+
+ spin_lock(&imp->imp_lock);
+ if (atomic_read(&imp->imp_inflight) == 0) {
+ int count = atomic_read(&imp->imp_unregistering);
+
+ /* We know that "unregistering" rpcs can only
+ * survive on the sending or delayed lists (they
+ * may be waiting for a long reply unlink on
+ * sluggish nets). Let's check this. If there
+ * are no inflight rpcs and unregistering != 0,
+ * this is a bug. */
+ LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n",
+ count);
+
+ /* Let's save one loop as soon as inflight has
+ * dropped to zero. No new inflight rpcs are
+ * possible at this point. */
+ rc = 0;
+ } else {
+ list_for_each_safe(tmp, n,
+ &imp->imp_sending_list) {
+ req = list_entry(tmp,
+ struct ptlrpc_request,
+ rq_list);
+ DEBUG_REQ(D_ERROR, req,
+ "still on sending list");
+ }
+ list_for_each_safe(tmp, n,
+ &imp->imp_delayed_list) {
+ req = list_entry(tmp,
+ struct ptlrpc_request,
+ rq_list);
+ DEBUG_REQ(D_ERROR, req,
+ "still on delayed list");
+ }
+
+ CERROR("%s: RPCs in \"%s\" phase found (%d). Network is sluggish? Waiting them to error out.\n",
+ cli_tgt,
+ ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
+ atomic_read(&imp->
+ imp_unregistering));
+ }
+ spin_unlock(&imp->imp_lock);
+ }
+ } while (rc != 0);
+
+ /*
+ * Let's additionally check that no new rpcs added to import in
+ * "invalidate" state.
+ */
+ LASSERT(atomic_read(&imp->imp_inflight) == 0);
+ obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+ sptlrpc_import_flush_all_ctx(imp);
+
+ atomic_dec(&imp->imp_inval_count);
+ wake_up_all(&imp->imp_recovery_waitq);
+}
+EXPORT_SYMBOL(ptlrpc_invalidate_import);
+
+/* unset imp_invalid */
+void ptlrpc_activate_import(struct obd_import *imp)
+{
+ struct obd_device *obd = imp->imp_obd;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_deactive != 0) {
+ spin_unlock(&imp->imp_lock);
+ return;
+ }
+
+ imp->imp_invalid = 0;
+ spin_unlock(&imp->imp_lock);
+ obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
+}
+EXPORT_SYMBOL(ptlrpc_activate_import);
+
+static void ptlrpc_pinger_force(struct obd_import *imp)
+{
+ CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
+ ptlrpc_import_state_name(imp->imp_state));
+
+ spin_lock(&imp->imp_lock);
+ imp->imp_force_verify = 1;
+ spin_unlock(&imp->imp_lock);
+
+ if (imp->imp_state != LUSTRE_IMP_CONNECTING)
+ ptlrpc_pinger_wake_up();
+}
+
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
+{
+ LASSERT(!imp->imp_dlm_fake);
+
+ if (ptlrpc_set_import_discon(imp, conn_cnt)) {
+ if (!imp->imp_replayable) {
+ CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid,
+ imp->imp_obd->obd_name);
+ ptlrpc_deactivate_import(imp);
+ }
+
+ ptlrpc_pinger_force(imp);
+ }
+}
+EXPORT_SYMBOL(ptlrpc_fail_import);
+
+int ptlrpc_reconnect_import(struct obd_import *imp)
+{
+#ifdef ENABLE_PINGER
+ struct l_wait_info lwi;
+ int secs = cfs_time_seconds(obd_timeout);
+ int rc;
+
+ ptlrpc_pinger_force(imp);
+
+ CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+ obd2cli_tgt(imp->imp_obd), secs);
+
+ lwi = LWI_TIMEOUT(secs, NULL, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ !ptlrpc_import_in_recovery(imp), &lwi);
+ CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd),
+ ptlrpc_import_state_name(imp->imp_state));
+ return rc;
+#else
+ ptlrpc_set_import_discon(imp, 0);
+ /* Force a new connect attempt */
+ ptlrpc_invalidate_import(imp);
+ /* Do a fresh connect next time by zeroing the handle */
+ ptlrpc_disconnect_import(imp, 1);
+ /* Wait for all invalidate calls to finish */
+ if (atomic_read(&imp->imp_inval_count) > 0) {
+ int rc;
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ (atomic_read(&imp->imp_inval_count) == 0),
+ &lwi);
+ if (rc)
+ CERROR("Interrupted, inval=%d\n",
+ atomic_read(&imp->imp_inval_count));
+ }
+
+ /* Allow reconnect attempts */
+ imp->imp_obd->obd_no_recov = 0;
+ /* Remove 'invalid' flag */
+ ptlrpc_activate_import(imp);
+ /* Attempt a new connect */
+ ptlrpc_recover_import(imp, NULL, 0);
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(ptlrpc_reconnect_import);
+
+/**
+ * Connection on import \a imp is changed to another one (if more than one is
+ * present). We typically choose the connection that we have not tried to
+ * connect to for the longest time.
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+ struct obd_import_conn *imp_conn = NULL, *conn;
+ struct obd_export *dlmexp;
+ char *target_start;
+ int target_len, tried_all = 1;
+
+ spin_lock(&imp->imp_lock);
+
+ if (list_empty(&imp->imp_conn_list)) {
+ CERROR("%s: no connections available\n",
+ imp->imp_obd->obd_name);
+ spin_unlock(&imp->imp_lock);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+ CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n",
+ imp->imp_obd->obd_name,
+ libcfs_nid2str(conn->oic_conn->c_peer.nid),
+ conn->oic_last_attempt);
+
+ /* If we have not tried this connection since
+ the last successful attempt, go with this one */
+ if ((conn->oic_last_attempt == 0) ||
+ cfs_time_beforeq_64(conn->oic_last_attempt,
+ imp->imp_last_success_conn)) {
+ imp_conn = conn;
+ tried_all = 0;
+ break;
+ }
+
+ /* If all of the connections have already been tried
+ since the last successful connection, just choose the
+ least recently used */
+ if (!imp_conn)
+ imp_conn = conn;
+ else if (cfs_time_before_64(conn->oic_last_attempt,
+ imp_conn->oic_last_attempt))
+ imp_conn = conn;
+ }
+
+ /* if not found, simply choose the current one */
+ if (!imp_conn || imp->imp_force_reconnect) {
+ LASSERT(imp->imp_conn_current);
+ imp_conn = imp->imp_conn_current;
+ tried_all = 0;
+ }
+ LASSERT(imp_conn->oic_conn);
+
+ /* If we've tried everything, and we're back to the beginning of the
+ list, increase our timeout and try again. It will be reset when
+ we do finally connect. (FIXME: really we should wait for all network
+ state associated with the last connection attempt to drain before
+ trying to reconnect on it.) */
+ if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+ struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+ if (at_get(at) < CONNECTION_SWITCH_MAX) {
+ at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+ if (at_get(at) > CONNECTION_SWITCH_MAX)
+ at_reset(at, CONNECTION_SWITCH_MAX);
+ }
+ LASSERT(imp_conn->oic_last_attempt);
+ CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n",
+ imp->imp_obd->obd_name, at_get(at));
+ }
+
+ imp_conn->oic_last_attempt = cfs_time_current_64();
+
+ /* switch connection, don't mind if it's same as the current one */
+ if (imp->imp_connection)
+ ptlrpc_connection_put(imp->imp_connection);
+ imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+
+ dlmexp = class_conn2export(&imp->imp_dlm_handle);
+ LASSERT(dlmexp != NULL);
+ if (dlmexp->exp_connection)
+ ptlrpc_connection_put(dlmexp->exp_connection);
+ dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+ class_export_put(dlmexp);
+
+ if (imp->imp_conn_current != imp_conn) {
+ if (imp->imp_conn_current) {
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+ &target_start, &target_len);
+
+ CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n",
+ imp->imp_obd->obd_name,
+ target_len, target_start,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+ }
+
+ imp->imp_conn_current = imp_conn;
+ }
+
+ CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+ imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+ libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+
+ spin_unlock(&imp->imp_lock);
+
+ return 0;
+}
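+/*
+ * Illustrative sketch (hypothetical timestamps): with imp_last_success_conn
+ * == 100 and connections whose oic_last_attempt values are 120, 90 and 130,
+ * the second one is picked (90 <= 100, i.e. not tried since the last
+ * successful connect) and tried_all stays 0. If every value were greater
+ * than 100, the least recently attempted connection would be chosen instead,
+ * and once the selection wraps back to the head of the list the AT net
+ * latency is increased by CONNECTION_SWITCH_INC, capped at
+ * CONNECTION_SWITCH_MAX.
+ */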
+
+/*
+ * must be called under imp_lock
+ */
+static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+ struct ptlrpc_request *req;
+ struct list_head *tmp;
+
+ /* The requests in committed_list always have smaller transnos than
+ * the requests in replay_list */
+ if (!list_empty(&imp->imp_committed_list)) {
+ tmp = imp->imp_committed_list.next;
+ req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+ *transno = req->rq_transno;
+ if (req->rq_transno == 0) {
+ DEBUG_REQ(D_ERROR, req,
+ "zero transno in committed_list");
+ LBUG();
+ }
+ return 1;
+ }
+ if (!list_empty(&imp->imp_replay_list)) {
+ tmp = imp->imp_replay_list.next;
+ req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+ *transno = req->rq_transno;
+ if (req->rq_transno == 0) {
+ DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
+ LBUG();
+ }
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * Attempt to (re)connect import \a imp. This includes all preparations,
+ * initializing CONNECT RPC request and passing it to ptlrpcd for
+ * actual sending.
+ * Returns 0 on success or error code.
+ */
+int ptlrpc_connect_import(struct obd_import *imp)
+{
+ struct obd_device *obd = imp->imp_obd;
+ int initial_connect = 0;
+ int set_transno = 0;
+ __u64 committed_before_reconnect = 0;
+ struct ptlrpc_request *request;
+ char *bufs[] = { NULL,
+ obd2cli_tgt(imp->imp_obd),
+ obd->obd_uuid.uuid,
+ (char *)&imp->imp_dlm_handle,
+ (char *)&imp->imp_connect_data };
+ struct ptlrpc_connect_async_args *aa;
+ int rc;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+ spin_unlock(&imp->imp_lock);
+ CERROR("can't connect to a closed import\n");
+ return -EINVAL;
+ } else if (imp->imp_state == LUSTRE_IMP_FULL) {
+ spin_unlock(&imp->imp_lock);
+ CERROR("already connected\n");
+ return 0;
+ } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
+ spin_unlock(&imp->imp_lock);
+ CERROR("already connecting\n");
+ return -EALREADY;
+ }
+
+ IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
+
+ imp->imp_conn_cnt++;
+ imp->imp_resend_replay = 0;
+
+ if (!lustre_handle_is_used(&imp->imp_remote_handle))
+ initial_connect = 1;
+ else
+ committed_before_reconnect = imp->imp_peer_committed_transno;
+
+ set_transno = ptlrpc_first_transno(imp,
+ &imp->imp_connect_data.ocd_transno);
+ spin_unlock(&imp->imp_lock);
+
+ rc = import_select_connection(imp);
+ if (rc)
+ goto out;
+
+ rc = sptlrpc_import_sec_adapt(imp, NULL, NULL);
+ if (rc)
+ goto out;
+
+ /* Reset connect flags to the originally requested flags, in case
+ * the server is updated on-the-fly we will get the new features. */
+ imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
+ /* Reset ocd_version each time so the server knows the exact versions */
+ imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
+ imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+ imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+ rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
+ &obd->obd_uuid, &imp->imp_connect_data, NULL);
+ if (rc)
+ goto out;
+
+ request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
+ if (request == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
+ imp->imp_connect_op, bufs, NULL);
+ if (rc) {
+ ptlrpc_request_free(request);
+ goto out;
+ }
+
+ /* Report the rpc service time to the server so that it knows how long
+ * to wait for clients to join recovery */
+ lustre_msg_set_service_time(request->rq_reqmsg,
+ at_timeout2est(request->rq_timeout));
+
+ /* The amount of time we give the server to process the connect req.
+ * import_select_connection will increase the net latency on
+ * repeated reconnect attempts to cover slow networks.
+ * We override/ignore the server rpc completion estimate here,
+ * which may be large if this is a reconnect attempt */
+ request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+ lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
+ lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
+
+ request->rq_no_resend = request->rq_no_delay = 1;
+ request->rq_send_state = LUSTRE_IMP_CONNECTING;
+ /* Allow a slightly larger reply for future growth compatibility */
+ req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
+ sizeof(struct obd_connect_data)+16*sizeof(__u64));
+ ptlrpc_request_set_replen(request);
+ request->rq_interpret_reply = ptlrpc_connect_interpret;
+
+ CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
+ aa = ptlrpc_req_async_args(request);
+ memset(aa, 0, sizeof(*aa));
+
+ aa->pcaa_peer_committed = committed_before_reconnect;
+ aa->pcaa_initial_connect = initial_connect;
+
+ if (aa->pcaa_initial_connect) {
+ spin_lock(&imp->imp_lock);
+ imp->imp_replayable = 1;
+ spin_unlock(&imp->imp_lock);
+ lustre_msg_add_op_flags(request->rq_reqmsg,
+ MSG_CONNECT_INITIAL);
+ }
+
+ if (set_transno)
+ lustre_msg_add_op_flags(request->rq_reqmsg,
+ MSG_CONNECT_TRANSNO);
+
+ DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
+ request->rq_timeout);
+ ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1);
+ rc = 0;
+out:
+ if (rc != 0) {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_connect_import);
+
+static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
+{
+ int force_verify;
+
+ spin_lock(&imp->imp_lock);
+ force_verify = imp->imp_force_verify != 0;
+ spin_unlock(&imp->imp_lock);
+
+ if (force_verify)
+ ptlrpc_pinger_wake_up();
+}
+
+static int ptlrpc_busy_reconnect(int rc)
+{
+ return (rc == -EBUSY) || (rc == -EAGAIN);
+}
+
+/**
+ * interpret_reply callback for connect RPCs.
+ * Looks into returned status of connect operation and decides
+ * what to do with the import - i.e enter recovery, promote it to
+ * full state for normal operations, or disconnect it due to an error.
+ */
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+ struct ptlrpc_request *request,
+ void *data, int rc)
+{
+ struct ptlrpc_connect_async_args *aa = data;
+ struct obd_import *imp = request->rq_import;
+ struct client_obd *cli = &imp->imp_obd->u.cli;
+ struct lustre_handle old_hdl;
+ __u64 old_connect_flags;
+ int msg_flags;
+ struct obd_connect_data *ocd;
+ struct obd_export *exp;
+ int ret;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+ imp->imp_connect_tried = 1;
+ spin_unlock(&imp->imp_lock);
+ return 0;
+ }
+
+ if (rc) {
+ /* if this is a reconnect to a busy export, there is no need to
+ * select a new target for connecting */
+ imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
+ spin_unlock(&imp->imp_lock);
+ ptlrpc_maybe_ping_import_soon(imp);
+ goto out;
+ }
+ spin_unlock(&imp->imp_lock);
+
+ LASSERT(imp->imp_conn_current);
+
+ msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+
+ ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
+ RCL_SERVER);
+ /* the obd_connect_data replied by the server is always bigger */
+ ocd = req_capsule_server_sized_get(&request->rq_pill,
+ &RMF_CONNECT_DATA, ret);
+
+ if (ocd == NULL) {
+ CERROR("%s: no connect data from server\n",
+ imp->imp_obd->obd_name);
+ rc = -EPROTO;
+ goto out;
+ }
+
+ spin_lock(&imp->imp_lock);
+
+ /* All imports are pingable */
+ imp->imp_pingable = 1;
+ imp->imp_force_reconnect = 0;
+ imp->imp_force_verify = 0;
+
+ imp->imp_connect_data = *ocd;
+
+ CDEBUG(D_HA, "%s: connect to target with instance %u\n",
+ imp->imp_obd->obd_name, ocd->ocd_instance);
+ exp = class_conn2export(&imp->imp_dlm_handle);
+
+ spin_unlock(&imp->imp_lock);
+
+ /* check that the server granted a subset of the flags we asked for */
+ if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
+ ocd->ocd_connect_flags) {
+ CERROR("%s: Server didn't granted asked subset of flags: asked=%#llx grranted=%#llx\n",
+ imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
+ ocd->ocd_connect_flags);
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (!exp) {
+ /* This could happen if the export is cleaned up during
+ the connect attempt */
+ CERROR("%s: missing export after connect\n",
+ imp->imp_obd->obd_name);
+ rc = -ENODEV;
+ goto out;
+ }
+ old_connect_flags = exp_connect_flags(exp);
+ exp->exp_connect_data = *ocd;
+ imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
+ class_export_put(exp);
+
+ obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+ if (aa->pcaa_initial_connect) {
+ spin_lock(&imp->imp_lock);
+ if (msg_flags & MSG_CONNECT_REPLAYABLE) {
+ imp->imp_replayable = 1;
+ spin_unlock(&imp->imp_lock);
+ CDEBUG(D_HA, "connected to replayable target: %s\n",
+ obd2cli_tgt(imp->imp_obd));
+ } else {
+ imp->imp_replayable = 0;
+ spin_unlock(&imp->imp_lock);
+ }
+
+ /* if applies, adjust the imp->imp_msg_magic here
+ * according to reply flags */
+
+ imp->imp_remote_handle =
+ *lustre_msg_get_handle(request->rq_repmsg);
+
+ /* Initial connects are allowed for clients with non-random
+ * uuids when servers are in recovery. Simply signal the
+ * servers that replay is complete and wait in REPLAY_WAIT. */
+ if (msg_flags & MSG_CONNECT_RECOVERING) {
+ CDEBUG(D_HA, "connect to %s during recovery\n",
+ obd2cli_tgt(imp->imp_obd));
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+ } else {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+ ptlrpc_activate_import(imp);
+ }
+
+ rc = 0;
+ goto finish;
+ }
+
+ /* Determine what recovery state to move the import to. */
+ if (MSG_CONNECT_RECONNECT & msg_flags) {
+ memset(&old_hdl, 0, sizeof(old_hdl));
+ if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
+ sizeof(old_hdl))) {
+ LCONSOLE_WARN("Reconnect to %s (at @%s) failed due bad handle %#llx\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid,
+ imp->imp_dlm_handle.cookie);
+ rc = -ENOTCONN;
+ goto out;
+ }
+
+ if (memcmp(&imp->imp_remote_handle,
+ lustre_msg_get_handle(request->rq_repmsg),
+ sizeof(imp->imp_remote_handle))) {
+ int level = msg_flags & MSG_CONNECT_RECOVERING ?
+ D_HA : D_WARNING;
+
+ /* Bug 16611/14775: if the server handle has changed,
+ * that means some sort of disconnection happened.
+ * If the server is not in recovery, that also means it
+ * already erased all of our state because of a previous
+ * eviction. If it is in recovery, we are safe to
+ * participate since we can re-establish all of our state
+ * with the server again */
+ if ((MSG_CONNECT_RECOVERING & msg_flags)) {
+ CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid,
+ imp->imp_remote_handle.cookie,
+ lustre_msg_get_handle(
+ request->rq_repmsg)->cookie);
+ } else {
+ LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid,
+ imp->imp_remote_handle.cookie,
+ lustre_msg_get_handle(
+ request->rq_repmsg)->cookie);
+ }
+
+
+ imp->imp_remote_handle =
+ *lustre_msg_get_handle(request->rq_repmsg);
+
+ if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+ rc = 0;
+ goto finish;
+ }
+
+ } else {
+ CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid);
+ }
+
+ if (imp->imp_invalid) {
+ CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n",
+ imp->imp_obd->obd_name);
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+ } else if (MSG_CONNECT_RECOVERING & msg_flags) {
+ CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+ imp->imp_obd->obd_name,
+ obd2cli_tgt(imp->imp_obd));
+
+ spin_lock(&imp->imp_lock);
+ imp->imp_resend_replay = 1;
+ spin_unlock(&imp->imp_lock);
+
+ IMPORT_SET_STATE(imp, imp->imp_replay_state);
+ } else {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+ }
+ } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
+ LASSERT(imp->imp_replayable);
+ imp->imp_remote_handle =
+ *lustre_msg_get_handle(request->rq_repmsg);
+ imp->imp_last_replay_transno = 0;
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+ } else {
+ DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)",
+ imp->imp_obd->obd_name, msg_flags);
+ imp->imp_remote_handle =
+ *lustre_msg_get_handle(request->rq_repmsg);
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+ }
+
+ /* Sanity checks for a reconnected import. */
+ if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
+ CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n");
+ }
+
+ if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
+ lustre_msg_get_last_committed(request->rq_repmsg) <
+ aa->pcaa_peer_committed) {
+ CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n",
+ obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
+ lustre_msg_get_last_committed(request->rq_repmsg));
+ }
+
+finish:
+ rc = ptlrpc_import_recovery_state_machine(imp);
+ if (rc != 0) {
+ if (rc == -ENOTCONN) {
+ CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid);
+ ptlrpc_connect_import(imp);
+ imp->imp_connect_tried = 1;
+ return 0;
+ }
+ } else {
+
+ spin_lock(&imp->imp_lock);
+ list_del(&imp->imp_conn_current->oic_item);
+ list_add(&imp->imp_conn_current->oic_item,
+ &imp->imp_conn_list);
+ imp->imp_last_success_conn =
+ imp->imp_conn_current->oic_last_attempt;
+
+ spin_unlock(&imp->imp_lock);
+
+ if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) &&
+ !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) {
+ LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n",
+ imp->imp_obd->obd_name,
+ imp->imp_connection->c_remote_uuid.uuid,
+ imp->imp_connect_flags_orig,
+ ocd->ocd_connect_flags);
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+ (ocd->ocd_version > LUSTRE_VERSION_CODE +
+ LUSTRE_VERSION_OFFSET_WARN ||
+ ocd->ocd_version < LUSTRE_VERSION_CODE -
+ LUSTRE_VERSION_OFFSET_WARN)) {
+ /* Sigh, some compilers do not like #ifdef in the middle
+ of macro arguments */
+ const char *older = "older. Consider upgrading server or downgrading client";
+ const char *newer = "newer than client version. Consider upgrading client";
+
+ LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n",
+ obd2cli_tgt(imp->imp_obd),
+ OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+ OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+ OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+ OBD_OCD_VERSION_FIX(ocd->ocd_version),
+ ocd->ocd_version > LUSTRE_VERSION_CODE ?
+ newer : older, LUSTRE_VERSION_STRING);
+ }
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+ /* Check if server has LU-1252 fix applied to not always swab
+ * the IR MNE entries. Do this only once per connection. This
+ * fixup is version-limited, because we don't want to carry the
+ * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
+ * need interop with unpatched 2.2 servers. For newer servers,
+ * the client will do MNE swabbing only as needed. LU-1644 */
+ if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+ !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
+ OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
+ OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
+ OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
+ strcmp(imp->imp_obd->obd_type->typ_name,
+ LUSTRE_MGC_NAME) == 0))
+ imp->imp_need_mne_swab = 1;
+ else /* clear if server was upgraded since last connect */
+ imp->imp_need_mne_swab = 0;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+ if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+ /* We sent to the server ocd_cksum_types with bits set
+ * for algorithms we understand. The server masked off
+ * the checksum types it doesn't support */
+ if ((ocd->ocd_cksum_types &
+ cksum_types_supported_client()) == 0) {
+ LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n",
+ obd2cli_tgt(imp->imp_obd),
+ ocd->ocd_cksum_types,
+ cksum_types_supported_client());
+ cli->cl_checksum = 0;
+ cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+ } else {
+ cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
+ }
+ } else {
+ /* The server does not support OBD_CONNECT_CKSUM.
+ * Enforce ADLER for backward compatibility. */
+ cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+ }
+ cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types);
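+ /* Negotiation sketch (hypothetical masks): if the client
+ * supports e.g. ADLER and CRC32C while the server replies
+ * only ADLER, the intersection is non-zero, so
+ * cl_supp_cksum_types takes the server's mask and
+ * cksum_type_select() picks from it; an empty intersection
+ * (or a server without OBD_CONNECT_CKSUM) falls back to
+ * OBD_CKSUM_ADLER, with checksums disabled in the former
+ * case. */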
+
+ if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+ cli->cl_max_pages_per_rpc =
+ min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT,
+ cli->cl_max_pages_per_rpc);
+ else if (imp->imp_connect_op == MDS_CONNECT ||
+ imp->imp_connect_op == MGS_CONNECT)
+ cli->cl_max_pages_per_rpc = 1;
+
+ /* Reset ns_connect_flags only for the initial connect. It might be
+ * changed while the FS is in use, and resetting it on reconnect
+ * would lose user settings made earlier, such as disabling
+ * lru_resize, etc. */
+ if (old_connect_flags != exp_connect_flags(exp) ||
+ aa->pcaa_initial_connect) {
+ CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n",
+ imp->imp_obd->obd_name, ocd->ocd_connect_flags);
+ imp->imp_obd->obd_namespace->ns_connect_flags =
+ ocd->ocd_connect_flags;
+ imp->imp_obd->obd_namespace->ns_orig_connect_flags =
+ ocd->ocd_connect_flags;
+ }
+
+ if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+ (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+ /* We need a per-message support flag, because
+ a. we don't know if the incoming connect reply
+ supports AT or not (in reply_in_callback)
+ until we unpack it.
+ b. a failed-over server means the export and flags are gone
+ (in ptlrpc_send_reply).
+ Can only be set when we know AT is supported at
+ both ends */
+ imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+ else
+ imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+ if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
+ (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+ imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+ else
+ imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+ LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
+ (cli->cl_max_pages_per_rpc > 0));
+ }
+
+out:
+ imp->imp_connect_tried = 1;
+
+ if (rc != 0) {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+ if (rc == -EACCES) {
+ /*
+ * Give up trying to reconnect
+ * EACCES means client has no permission for connection
+ */
+ imp->imp_obd->obd_no_recov = 1;
+ ptlrpc_deactivate_import(imp);
+ }
+
+ if (rc == -EPROTO) {
+ struct obd_connect_data *ocd;
+
+ /* reply message might not be ready */
+ if (request->rq_repmsg == NULL)
+ return -EPROTO;
+
+ ocd = req_capsule_server_get(&request->rq_pill,
+ &RMF_CONNECT_DATA);
+ if (ocd &&
+ (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+ (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+ /*
+ * Actually servers are only supposed to refuse
+ * connection from liblustre clients, so we
+ * should never see this from VFS context
+ */
+ LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). Client must be recompiled\n",
+ obd2cli_tgt(imp->imp_obd),
+ OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+ OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+ OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+ OBD_OCD_VERSION_FIX(ocd->ocd_version),
+ LUSTRE_VERSION_STRING);
+ ptlrpc_deactivate_import(imp);
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
+ }
+ return -EPROTO;
+ }
+
+ ptlrpc_maybe_ping_import_soon(imp);
+
+ CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
+ obd2cli_tgt(imp->imp_obd),
+ (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
+ }
+
+ wake_up_all(&imp->imp_recovery_waitq);
+ return rc;
+}
+
+/**
+ * interpret callback for "completed replay" RPCs.
+ * \see signal_completed_replay
+ */
+static int completed_replay_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
+ void *data, int rc)
+{
+ atomic_dec(&req->rq_import->imp_replay_inflight);
+ if (req->rq_status == 0 &&
+ !req->rq_import->imp_vbr_failed) {
+ ptlrpc_import_recovery_state_machine(req->rq_import);
+ } else {
+ if (req->rq_import->imp_vbr_failed) {
+ CDEBUG(D_WARNING,
+ "%s: version recovery fails, reconnecting\n",
+ req->rq_import->imp_obd->obd_name);
+ } else {
+ CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n",
+ req->rq_import->imp_obd->obd_name,
+ req->rq_status);
+ }
+ ptlrpc_connect_import(req->rq_import);
+ }
+
+ return 0;
+}
+
+/**
+ * Let server know that we have no requests to replay anymore.
+ * Achieved by just sending a PING request
+ */
+static int signal_completed_replay(struct obd_import *imp)
+{
+ struct ptlrpc_request *req;
+
+ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+ return 0;
+
+ LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+ atomic_inc(&imp->imp_replay_inflight);
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
+ OBD_PING);
+ if (req == NULL) {
+ atomic_dec(&imp->imp_replay_inflight);
+ return -ENOMEM;
+ }
+
+ ptlrpc_request_set_replen(req);
+ req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+ lustre_msg_add_flags(req->rq_reqmsg,
+ MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+ if (AT_OFF)
+ req->rq_timeout *= 3;
+ req->rq_interpret_reply = completed_replay_interpret;
+
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ return 0;
+}
+
+/**
+ * In kernel code all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * a problem could still be killed or otherwise continue
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+ struct obd_import *imp = data;
+
+ unshare_fs_struct();
+
+ CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+ imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid);
+
+ ptlrpc_invalidate_import(imp);
+
+ if (obd_dump_on_eviction) {
+ CERROR("dump the log upon eviction\n");
+ libcfs_debug_dumplog();
+ }
+
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+ ptlrpc_import_recovery_state_machine(imp);
+
+ class_import_put(imp);
+ return 0;
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically we have two possible paths. If we reach a server that is not
+ * in recovery, we just enter the IMP_EVICTED state, invalidate our import
+ * state and reconnect from scratch.
+ * If we reach a server that is in recovery, we enter the IMP_REPLAY import
+ * state. We go through our list of requests to replay and send them to the
+ * server one by one.
+ * After sending all requests from the list we change the import state to
+ * IMP_REPLAY_LOCKS and re-request from the server all the locks we believe
+ * we hold, as well as those not yet granted, and wait for the server to
+ * grant them.
+ * After that we send a special "replay completed" request and change the
+ * import state to IMP_REPLAY_WAIT.
+ * Upon receiving the reply to that "replay completed" RPC we enter the
+ * IMP_RECOVER state and resend all requests from the sending list.
+ * After that we promote the import to the FULL state and send all delayed
+ * requests; the import is fully operational from then on.
+ *
+ */
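+/*
+ * Compact transition sketch, restating the comment above (not part of the
+ * original source):
+ *
+ *   server not in recovery:  EVICTED -> invalidate -> RECOVER -> FULL
+ *   server in recovery:      REPLAY -> REPLAY_LOCKS -> REPLAY_WAIT ->
+ *                            RECOVER -> FULL
+ *
+ * REPLAY resends the committed/replay lists, REPLAY_LOCKS replays DLM locks,
+ * REPLAY_WAIT waits for the "replay completed" ping reply, RECOVER resends
+ * the sending list, and FULL wakes the delayed requests.
+ */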
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
+{
+ int rc = 0;
+ int inflight;
+ char *target_start;
+ int target_len;
+
+ if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+ &target_start, &target_len);
+ /* Don't care about MGC eviction */
+ if (strcmp(imp->imp_obd->obd_type->typ_name,
+ LUSTRE_MGC_NAME) != 0) {
+ LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n",
+ imp->imp_obd->obd_name, target_len,
+ target_start);
+ }
+ CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid);
+ /* reset vbr_failed flag upon eviction */
+ spin_lock(&imp->imp_lock);
+ imp->imp_vbr_failed = 0;
+ spin_unlock(&imp->imp_lock);
+
+ {
+ struct task_struct *task;
+ /* bug 17802: XXX client_disconnect_export vs connect request
+ * race. If the client is evicted at this time, we could start
+ * the invalidate thread without a reference to the import, and
+ * the import could be freed at the same time. */
+ class_import_get(imp);
+ task = kthread_run(ptlrpc_invalidate_import_thread, imp,
+ "ll_imp_inval");
+ if (IS_ERR(task)) {
+ class_import_put(imp);
+ CERROR("error starting invalidate thread: %d\n", rc);
+ rc = PTR_ERR(task);
+ } else {
+ rc = 0;
+ }
+ return rc;
+ }
+ }
+
+ if (imp->imp_state == LUSTRE_IMP_REPLAY) {
+ CDEBUG(D_HA, "replay requested by %s\n",
+ obd2cli_tgt(imp->imp_obd));
+ rc = ptlrpc_replay_next(imp, &inflight);
+ if (inflight == 0 &&
+ atomic_read(&imp->imp_replay_inflight) == 0) {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+ rc = ldlm_replay_locks(imp);
+ if (rc)
+ goto out;
+ }
+ rc = 0;
+ }
+
+ if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
+ if (atomic_read(&imp->imp_replay_inflight) == 0) {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
+ rc = signal_completed_replay(imp);
+ if (rc)
+ goto out;
+ }
+
+ }
+
+ if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
+ if (atomic_read(&imp->imp_replay_inflight) == 0) {
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+ }
+ }
+
+ if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+ CDEBUG(D_HA, "reconnected to %s@%s\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid);
+
+ rc = ptlrpc_resend(imp);
+ if (rc)
+ goto out;
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+ ptlrpc_activate_import(imp);
+
+ deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+ &target_start, &target_len);
+ LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
+ imp->imp_obd->obd_name,
+ target_len, target_start,
+ libcfs_nid2str(imp->imp_connection->c_peer.nid));
+ }
+
+ if (imp->imp_state == LUSTRE_IMP_FULL) {
+ wake_up_all(&imp->imp_recovery_waitq);
+ ptlrpc_wake_delayed(imp);
+ }
+
+out:
+ return rc;
+}
+
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
+{
+ struct ptlrpc_request *req;
+ int rq_opc, rc = 0;
+
+ if (imp->imp_obd->obd_force)
+ goto set_state;
+
+ switch (imp->imp_connect_op) {
+ case OST_CONNECT:
+ rq_opc = OST_DISCONNECT;
+ break;
+ case MDS_CONNECT:
+ rq_opc = MDS_DISCONNECT;
+ break;
+ case MGS_CONNECT:
+ rq_opc = MGS_DISCONNECT;
+ break;
+ default:
+ rc = -EINVAL;
+ CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n",
+ imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+ imp->imp_connect_op, rc);
+ return rc;
+ }
+
+ if (ptlrpc_import_in_recovery(imp)) {
+ struct l_wait_info lwi;
+ long timeout;
+
+ if (AT_OFF) {
+ if (imp->imp_server_timeout)
+ timeout = cfs_time_seconds(obd_timeout / 2);
+ else
+ timeout = cfs_time_seconds(obd_timeout);
+ } else {
+ int idx = import_at_get_index(imp,
+ imp->imp_client->cli_request_portal);
+ timeout = cfs_time_seconds(
+ at_get(&imp->imp_at.iat_service_estimate[idx]));
+ }
+
+ lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
+ back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ !ptlrpc_import_in_recovery(imp), &lwi);
+
+ }
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state != LUSTRE_IMP_FULL)
+ goto out;
+ spin_unlock(&imp->imp_lock);
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
+ LUSTRE_OBD_VERSION, rq_opc);
+ if (req) {
+ /* We are disconnecting; do not retry the DISCONNECT rpc if
+ * it fails. We can get through the above with a down server
+ * if the client doesn't know the server is gone yet. */
+ req->rq_no_resend = 1;
+
+ /* We want client umounts to happen quickly, no matter the
+ server state... */
+ req->rq_timeout = min_t(int, req->rq_timeout,
+ INITIAL_CONNECT_TIMEOUT);
+
+ IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+ req->rq_send_state = LUSTRE_IMP_CONNECTING;
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ ptlrpc_req_finished(req);
+ }
+
+set_state:
+ spin_lock(&imp->imp_lock);
+out:
+ if (noclose)
+ IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+ else
+ IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+ memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+ spin_unlock(&imp->imp_lock);
+
+ if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN)
+ rc = 0;
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_disconnect_import);
+
+void ptlrpc_cleanup_imp(struct obd_import *imp)
+{
+ spin_lock(&imp->imp_lock);
+ IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+ imp->imp_generation++;
+ spin_unlock(&imp->imp_lock);
+ ptlrpc_abort_inflight(imp);
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_imp);
+
+/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
+
+/* Bin into timeslices using AT_BINS bins.
+ This gives us a max over the last binlimit*AT_BINS secs without storing
+ every sample, while still smoothing out a return to normalcy after a slow
+ response.
+ (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
+int at_measured(struct adaptive_timeout *at, unsigned int val)
+{
+ unsigned int old = at->at_current;
+ time_t now = get_seconds();
+ time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
+
+ LASSERT(at);
+ CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+ val, at, now - at->at_binstart, at->at_current,
+ at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+
+ if (val == 0)
+ /* 0's don't count, because we never want our timeout to
+ drop to 0, and because 0 could mean an error */
+ return 0;
+
+ spin_lock(&at->at_lock);
+
+ if (unlikely(at->at_binstart == 0)) {
+ /* Special case to remove default from history */
+ at->at_current = val;
+ at->at_worst_ever = val;
+ at->at_worst_time = now;
+ at->at_hist[0] = val;
+ at->at_binstart = now;
+ } else if (now - at->at_binstart < binlimit) {
+ /* in bin 0 */
+ at->at_hist[0] = max(val, at->at_hist[0]);
+ at->at_current = max(val, at->at_current);
+ } else {
+ int i, shift;
+ unsigned int maxv = val;
+ /* move bins over */
+ shift = (now - at->at_binstart) / binlimit;
+ LASSERT(shift > 0);
+ for (i = AT_BINS - 1; i >= 0; i--) {
+ if (i >= shift) {
+ at->at_hist[i] = at->at_hist[i - shift];
+ maxv = max(maxv, at->at_hist[i]);
+ } else {
+ at->at_hist[i] = 0;
+ }
+ }
+ at->at_hist[0] = val;
+ at->at_current = maxv;
+ at->at_binstart += shift * binlimit;
+ }
+
+ if (at->at_current > at->at_worst_ever) {
+ at->at_worst_ever = at->at_current;
+ at->at_worst_time = now;
+ }
+
+ if (at->at_flags & AT_FLG_NOHIST)
+		/* Only keep the last reported value; the rest of the history
+		 * is kept for /proc reporting only */
+ at->at_current = val;
+
+ if (at_max > 0)
+ at->at_current = min(at->at_current, at_max);
+ at->at_current = max(at->at_current, at_min);
+
+ if (at->at_current != old)
+ CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n",
+ at,
+ old, at->at_current, at->at_current - old, val,
+ at->at_hist[0], at->at_hist[1], at->at_hist[2],
+ at->at_hist[3]);
+
+ /* if we changed, report the old value */
+ old = (at->at_current != old) ? old : 0;
+
+ spin_unlock(&at->at_lock);
+ return old;
+}
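To make the bin arithmetic above concrete, here is a simplified user-space model of the same sliding-maximum idea (annotation, not part of this patch). struct simple_at, BINS and BINLIMIT are made-up stand-ins for adaptive_timeout, AT_BINS and at_history/AT_BINS; locking, worst-ever tracking and the at_min/at_max clamps are left out.

/* Illustrative sketch only; a stripped-down model of at_measured(). */
#define BINS		4
#define BINLIMIT	60	/* seconds per bin, e.g. at_history / AT_BINS */

struct simple_at {
	unsigned int	hist[BINS];	/* per-bin maxima, hist[0] is current */
	unsigned int	current;	/* max over all live bins */
	long		binstart;	/* start time of hist[0] */
};

static void simple_at_measured(struct simple_at *at, long now, unsigned int val)
{
	long shift = (now - at->binstart) / BINLIMIT;
	unsigned int maxv = val;
	int i;

	if (shift > 0) {
		/* age the bins: bin i takes the old value of bin i - shift */
		for (i = BINS - 1; i >= 0; i--) {
			at->hist[i] = (i >= shift) ? at->hist[i - shift] : 0;
			if (at->hist[i] > maxv)
				maxv = at->hist[i];
		}
		at->hist[0] = val;
		at->current = maxv;
		at->binstart += shift * BINLIMIT;
	} else {
		/* still inside bin 0: just track the maxima */
		if (val > at->hist[0])
			at->hist[0] = val;
		if (val > at->current)
			at->current = val;
	}
}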
+
+/* Find the imp_at index for a given portal; assign if space available */
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+ struct imp_at *at = &imp->imp_at;
+ int i;
+
+ for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+ if (at->iat_portal[i] == portal)
+ return i;
+ if (at->iat_portal[i] == 0)
+ /* unused */
+ break;
+ }
+
+ /* Not found in list, add it under a lock */
+ spin_lock(&imp->imp_lock);
+
+ /* Check unused under lock */
+ for (; i < IMP_AT_MAX_PORTALS; i++) {
+ if (at->iat_portal[i] == portal)
+ goto out;
+ if (at->iat_portal[i] == 0)
+ /* unused */
+ break;
+ }
+
+ /* Not enough portals? */
+ LASSERT(i < IMP_AT_MAX_PORTALS);
+
+ at->iat_portal[i] = portal;
+out:
+ spin_unlock(&imp->imp_lock);
+ return i;
+}
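A hedged sketch of how the two helpers above typically combine (annotation, not part of this patch): a measured service time is folded into the adaptive-timeout slot for the portal the request was sent to. The function name and the service_time argument are hypothetical.

/* Illustrative only; not part of this patch. */
static void example_note_service_time(struct obd_import *imp, int portal,
				      unsigned int service_time)
{
	int idx = import_at_get_index(imp, portal);

	at_measured(&imp->imp_at.iat_service_estimate[idx], service_time);
}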
diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c
new file mode 100644
index 000000000..a42335e26
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/layout.c
@@ -0,0 +1,2442 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/layout.c
+ *
+ * Lustre request layout definitions
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+/*
+ * This file contains the "capsule/pill" abstraction layered above PTLRPC.
+ *
+ * Every struct ptlrpc_request contains a "pill", which points to a description
+ * of the format that the request conforms to.
+ */
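A hedged client-side sketch of the capsule API defined in this file (annotation, not part of this patch). In the live client path the format is normally attached via the request allocation helpers (e.g. ptlrpc_request_alloc_pack() seen in import.c above); this sketch calls the capsule functions directly, with error handling trimmed and the format and field chosen only as examples.

/* Illustrative only; not part of this patch. */
static int example_fill_getattr_request(struct ptlrpc_request *req)
{
	struct mdt_body *body;

	/* declare which format this request follows */
	req_capsule_set(&req->rq_pill, &RQF_MDS_GETATTR);

	/* fetch the request-side buffer described by RMF_MDT_BODY */
	body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		return -EPROTO;

	/* ... fill in *body, then send the request as usual ... */
	return 0;
}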
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/module.h>
+
+/* LUSTRE_VERSION_CODE */
+#include "../include/lustre_ver.h"
+
+#include "../include/obd_support.h"
+/* lustre_swab_mdt_body */
+#include "../include/lustre/lustre_idl.h"
+/* obd2cli_tgt() (required by DEBUG_REQ()) */
+#include "../include/obd.h"
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+/* struct ptlrpc_request, lustre_msg* */
+#include "../include/lustre_req_layout.h"
+#include "../include/lustre_acl.h"
+#include "../include/lustre_debug.h"
+
+/*
+ * RQFs (see below) refer to two struct req_msg_field arrays describing the
+ * client request and server reply, respectively.
+ */
+/* empty set of fields... for suitable definition of emptiness. */
+static const struct req_msg_field *empty[] = {
+ &RMF_PTLRPC_BODY
+};
+
+static const struct req_msg_field *mgs_target_info_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MGS_TARGET_INFO
+};
+
+static const struct req_msg_field *mgs_set_info[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MGS_SEND_PARAM
+};
+
+static const struct req_msg_field *mgs_config_read_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MGS_CONFIG_BODY
+};
+
+static const struct req_msg_field *mgs_config_read_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MGS_CONFIG_RES
+};
+
+static const struct req_msg_field *log_cancel_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_LOGCOOKIES
+};
+
+static const struct req_msg_field *mdt_body_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY
+};
+
+static const struct req_msg_field *mdt_body_capa[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_CAPA1
+};
+
+static const struct req_msg_field *quotactl_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OBD_QUOTACTL
+};
+
+static const struct req_msg_field *quota_body_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP,
+ &RMF_DLM_LVB,
+ &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *mdt_close_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_EPOCH,
+ &RMF_REC_REINT,
+ &RMF_CAPA1
+};
+
+static const struct req_msg_field *mdt_release_close_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_EPOCH,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_CLOSE_DATA
+};
+
+static const struct req_msg_field *obd_statfs_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OBD_STATFS
+};
+
+static const struct req_msg_field *seq_query_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_SEQ_OPC,
+ &RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *seq_query_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *fld_query_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_FLD_OPC,
+ &RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *fld_query_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *mds_getattr_name_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_CAPA1,
+ &RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT
+};
+
+static const struct req_msg_field *mds_reint_create_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_create_slave_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_EADATA,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_EADATA,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_sym_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_SYMTGT,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_open_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_CAPA2,
+ &RMF_NAME,
+ &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_reint_open_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL,
+ &RMF_CAPA1,
+ &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_unlink_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_link_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_CAPA2,
+ &RMF_NAME,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_rename_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_CAPA2,
+ &RMF_NAME,
+ &RMF_SYMTGT,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_last_unlink_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_LOGCOOKIES,
+ &RMF_CAPA1,
+ &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_setattr_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_MDT_EPOCH,
+ &RMF_EADATA,
+ &RMF_LOGCOOKIES,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_setxattr_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_EADATA,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mdt_swap_layouts[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_SWAP_LAYOUTS,
+ &RMF_CAPA1,
+ &RMF_CAPA2,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *obd_connect_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_TGTUUID,
+ &RMF_CLUUID,
+ &RMF_CONN,
+ &RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_connect_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_set_info_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_SETINFO_KEY,
+ &RMF_SETINFO_VAL
+};
+
+static const struct req_msg_field *ost_grant_shrink_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_SETINFO_KEY,
+ &RMF_OST_BODY
+};
+
+static const struct req_msg_field *mds_getinfo_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_GETINFO_KEY,
+ &RMF_GETINFO_VALLEN
+};
+
+static const struct req_msg_field *mds_getinfo_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_GETINFO_VAL,
+};
+
+static const struct req_msg_field *ldlm_enqueue_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *ldlm_enqueue_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP
+};
+
+static const struct req_msg_field *ldlm_enqueue_lvb_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP,
+ &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_cp_callback_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_DLM_GL_DESC
+};
+
+static const struct req_msg_field *ldlm_gl_callback_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_intent_basic_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+};
+
+static const struct req_msg_field *ldlm_intent_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_REC_REINT
+};
+
+static const struct req_msg_field *ldlm_intent_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL
+};
+
+static const struct req_msg_field *ldlm_intent_layout_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_LAYOUT_INTENT,
+ &RMF_EADATA /* for new layout to be set up */
+};
+static const struct req_msg_field *ldlm_intent_open_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL,
+ &RMF_CAPA1,
+ &RMF_CAPA2
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */
+ &RMF_CAPA1,
+ &RMF_NAME
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL,
+ &RMF_CAPA1
+};
+
+static const struct req_msg_field *ldlm_intent_create_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_open_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */
+ &RMF_CAPA1,
+ &RMF_CAPA2,
+ &RMF_NAME,
+ &RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_unlink_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */
+ &RMF_CAPA1,
+ &RMF_NAME
+};
+
+static const struct req_msg_field *ldlm_intent_getxattr_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REQ,
+ &RMF_LDLM_INTENT,
+ &RMF_MDT_BODY,
+ &RMF_CAPA1,
+};
+
+static const struct req_msg_field *ldlm_intent_getxattr_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_DLM_REP,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */
+ &RMF_EADATA,
+ &RMF_EAVALS,
+ &RMF_EAVALS_LENS
+};
+
+static const struct req_msg_field *mds_getxattr_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_CAPA1,
+ &RMF_NAME,
+ &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getxattr_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getattr_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL,
+ &RMF_CAPA1,
+ &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_setattr_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDT_MD,
+ &RMF_ACL,
+ &RMF_CAPA1,
+ &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_update_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_UPDATE,
+};
+
+static const struct req_msg_field *mds_update_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_UPDATE_REPLY,
+};
+
+static const struct req_msg_field *llog_origin_handle_create_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_LLOGD_BODY,
+ &RMF_NAME
+};
+
+static const struct req_msg_field *llogd_body_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_LLOGD_BODY
+};
+
+static const struct req_msg_field *llog_log_hdr_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_LLOG_LOG_HDR
+};
+
+static const struct req_msg_field *llogd_conn_body_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_LLOGD_CONN_BODY
+};
+
+static const struct req_msg_field *llog_origin_handle_next_block_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_LLOGD_BODY,
+ &RMF_EADATA
+};
+
+static const struct req_msg_field *obd_idx_read_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_IDX_INFO
+};
+
+static const struct req_msg_field *obd_idx_read_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_IDX_INFO
+};
+
+static const struct req_msg_field *ost_body_only[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_body_capa[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OST_BODY,
+ &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_destroy_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OST_BODY,
+ &RMF_DLM_REQ,
+ &RMF_CAPA1
+};
+
+
+static const struct req_msg_field *ost_brw_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OST_BODY,
+ &RMF_OBD_IOOBJ,
+ &RMF_NIOBUF_REMOTE,
+ &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_brw_read_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_brw_write_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OST_BODY,
+ &RMF_RCS
+};
+
+static const struct req_msg_field *ost_get_info_generic_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_GENERIC_DATA,
+};
+
+static const struct req_msg_field *ost_get_info_generic_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_SETINFO_KEY
+};
+
+static const struct req_msg_field *ost_get_last_id_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_OBD_ID
+};
+
+static const struct req_msg_field *ost_get_last_fid_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_FID,
+};
+
+static const struct req_msg_field *ost_get_fiemap_client[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_FIEMAP_KEY,
+ &RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *ost_get_fiemap_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *mdt_hsm_progress[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDS_HSM_PROGRESS,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_register[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDS_HSM_ARCHIVE,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_unregister[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+};
+
+static const struct req_msg_field *mdt_hsm_action_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDS_HSM_CURRENT_ACTION,
+};
+
+static const struct req_msg_field *mdt_hsm_state_get_server[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_HSM_USER_STATE,
+};
+
+static const struct req_msg_field *mdt_hsm_state_set[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_CAPA1,
+ &RMF_HSM_STATE_SET,
+};
+
+static const struct req_msg_field *mdt_hsm_request[] = {
+ &RMF_PTLRPC_BODY,
+ &RMF_MDT_BODY,
+ &RMF_MDS_HSM_REQUEST,
+ &RMF_MDS_HSM_USER_ITEM,
+ &RMF_GENERIC_DATA,
+};
+
+static struct req_format *req_formats[] = {
+ &RQF_OBD_PING,
+ &RQF_OBD_SET_INFO,
+ &RQF_OBD_IDX_READ,
+ &RQF_SEC_CTX,
+ &RQF_MGS_TARGET_REG,
+ &RQF_MGS_SET_INFO,
+ &RQF_MGS_CONFIG_READ,
+ &RQF_SEQ_QUERY,
+ &RQF_FLD_QUERY,
+ &RQF_MDS_CONNECT,
+ &RQF_MDS_DISCONNECT,
+ &RQF_MDS_GET_INFO,
+ &RQF_MDS_GETSTATUS,
+ &RQF_MDS_STATFS,
+ &RQF_MDS_GETATTR,
+ &RQF_MDS_GETATTR_NAME,
+ &RQF_MDS_GETXATTR,
+ &RQF_MDS_SYNC,
+ &RQF_MDS_CLOSE,
+ &RQF_MDS_RELEASE_CLOSE,
+ &RQF_MDS_PIN,
+ &RQF_MDS_UNPIN,
+ &RQF_MDS_READPAGE,
+ &RQF_MDS_WRITEPAGE,
+ &RQF_MDS_IS_SUBDIR,
+ &RQF_MDS_DONE_WRITING,
+ &RQF_MDS_REINT,
+ &RQF_MDS_REINT_CREATE,
+ &RQF_MDS_REINT_CREATE_RMT_ACL,
+ &RQF_MDS_REINT_CREATE_SLAVE,
+ &RQF_MDS_REINT_CREATE_SYM,
+ &RQF_MDS_REINT_OPEN,
+ &RQF_MDS_REINT_UNLINK,
+ &RQF_MDS_REINT_LINK,
+ &RQF_MDS_REINT_RENAME,
+ &RQF_MDS_REINT_SETATTR,
+ &RQF_MDS_REINT_SETXATTR,
+ &RQF_MDS_QUOTACHECK,
+ &RQF_MDS_QUOTACTL,
+ &RQF_MDS_HSM_PROGRESS,
+ &RQF_MDS_HSM_CT_REGISTER,
+ &RQF_MDS_HSM_CT_UNREGISTER,
+ &RQF_MDS_HSM_STATE_GET,
+ &RQF_MDS_HSM_STATE_SET,
+ &RQF_MDS_HSM_ACTION,
+ &RQF_MDS_HSM_REQUEST,
+ &RQF_MDS_SWAP_LAYOUTS,
+ &RQF_UPDATE_OBJ,
+ &RQF_QC_CALLBACK,
+ &RQF_OST_CONNECT,
+ &RQF_OST_DISCONNECT,
+ &RQF_OST_QUOTACHECK,
+ &RQF_OST_QUOTACTL,
+ &RQF_OST_GETATTR,
+ &RQF_OST_SETATTR,
+ &RQF_OST_CREATE,
+ &RQF_OST_PUNCH,
+ &RQF_OST_SYNC,
+ &RQF_OST_DESTROY,
+ &RQF_OST_BRW_READ,
+ &RQF_OST_BRW_WRITE,
+ &RQF_OST_STATFS,
+ &RQF_OST_SET_GRANT_INFO,
+ &RQF_OST_GET_INFO_GENERIC,
+ &RQF_OST_GET_INFO_LAST_ID,
+ &RQF_OST_GET_INFO_LAST_FID,
+ &RQF_OST_SET_INFO_LAST_FID,
+ &RQF_OST_GET_INFO_FIEMAP,
+ &RQF_LDLM_ENQUEUE,
+ &RQF_LDLM_ENQUEUE_LVB,
+ &RQF_LDLM_CONVERT,
+ &RQF_LDLM_CANCEL,
+ &RQF_LDLM_CALLBACK,
+ &RQF_LDLM_CP_CALLBACK,
+ &RQF_LDLM_BL_CALLBACK,
+ &RQF_LDLM_GL_CALLBACK,
+ &RQF_LDLM_GL_DESC_CALLBACK,
+ &RQF_LDLM_INTENT,
+ &RQF_LDLM_INTENT_BASIC,
+ &RQF_LDLM_INTENT_LAYOUT,
+ &RQF_LDLM_INTENT_GETATTR,
+ &RQF_LDLM_INTENT_OPEN,
+ &RQF_LDLM_INTENT_CREATE,
+ &RQF_LDLM_INTENT_UNLINK,
+ &RQF_LDLM_INTENT_GETXATTR,
+ &RQF_LDLM_INTENT_QUOTA,
+ &RQF_QUOTA_DQACQ,
+ &RQF_LOG_CANCEL,
+ &RQF_LLOG_ORIGIN_HANDLE_CREATE,
+ &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+ &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+ &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+ &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+ &RQF_LLOG_ORIGIN_CONNECT,
+ &RQF_CONNECT,
+};
+
+struct req_msg_field {
+ const __u32 rmf_flags;
+ const char *rmf_name;
+ /**
+ * Field length. (-1) means "variable length". If the
+ * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length,
+ * but the actual size must be a whole multiple of \a rmf_size.
+ */
+ const int rmf_size;
+ void (*rmf_swabber)(void *);
+ void (*rmf_dumper)(void *);
+ int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR];
+};
+
+enum rmf_flags {
+ /**
+ * The field is a string, must be NUL-terminated.
+ */
+ RMF_F_STRING = 1 << 0,
+ /**
+ * The field's buffer size need not match the declared \a rmf_size.
+ */
+ RMF_F_NO_SIZE_CHECK = 1 << 1,
+ /**
+ * The field's buffer size must be a whole multiple of the declared \a
+ * rmf_size and the \a rmf_swabber function must work on the declared \a
+ * rmf_size worth of bytes.
+ */
+ RMF_F_STRUCT_ARRAY = 1 << 2
+};
+
+struct req_capsule;
+
+/*
+ * Request fields.
+ */
+#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \
+ .rmf_name = (name), \
+ .rmf_flags = (flags), \
+ .rmf_size = (size), \
+ .rmf_swabber = (void (*)(void *))(swabber), \
+ .rmf_dumper = (void (*)(void *))(dumper) \
+}
+
+struct req_msg_field RMF_GENERIC_DATA =
+ DEFINE_MSGF("generic_data", 0,
+ -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GENERIC_DATA);
+
+struct req_msg_field RMF_MGS_TARGET_INFO =
+ DEFINE_MSGF("mgs_target_info", 0,
+ sizeof(struct mgs_target_info),
+ lustre_swab_mgs_target_info, NULL);
+EXPORT_SYMBOL(RMF_MGS_TARGET_INFO);
+
+struct req_msg_field RMF_MGS_SEND_PARAM =
+ DEFINE_MSGF("mgs_send_param", 0,
+ sizeof(struct mgs_send_param),
+ NULL, NULL);
+EXPORT_SYMBOL(RMF_MGS_SEND_PARAM);
+
+struct req_msg_field RMF_MGS_CONFIG_BODY =
+ DEFINE_MSGF("mgs_config_read request", 0,
+ sizeof(struct mgs_config_body),
+ lustre_swab_mgs_config_body, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY);
+
+struct req_msg_field RMF_MGS_CONFIG_RES =
+ DEFINE_MSGF("mgs_config_read reply ", 0,
+ sizeof(struct mgs_config_res),
+ lustre_swab_mgs_config_res, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
+
+struct req_msg_field RMF_U32 =
+ DEFINE_MSGF("generic u32", 0,
+ sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_U32);
+
+struct req_msg_field RMF_SETINFO_VAL =
+ DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_VAL);
+
+struct req_msg_field RMF_GETINFO_KEY =
+ DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_KEY);
+
+struct req_msg_field RMF_GETINFO_VALLEN =
+ DEFINE_MSGF("getinfo_vallen", 0,
+ sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VALLEN);
+
+struct req_msg_field RMF_GETINFO_VAL =
+ DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VAL);
+
+struct req_msg_field RMF_SEQ_OPC =
+ DEFINE_MSGF("seq_query_opc", 0,
+ sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_SEQ_OPC);
+
+struct req_msg_field RMF_SEQ_RANGE =
+ DEFINE_MSGF("seq_query_range", 0,
+ sizeof(struct lu_seq_range),
+ lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_SEQ_RANGE);
+
+struct req_msg_field RMF_FLD_OPC =
+ DEFINE_MSGF("fld_query_opc", 0,
+ sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_FLD_OPC);
+
+struct req_msg_field RMF_FLD_MDFLD =
+ DEFINE_MSGF("fld_query_mdfld", 0,
+ sizeof(struct lu_seq_range),
+ lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_FLD_MDFLD);
+
+struct req_msg_field RMF_MDT_BODY =
+ DEFINE_MSGF("mdt_body", 0,
+ sizeof(struct mdt_body), lustre_swab_mdt_body, NULL);
+EXPORT_SYMBOL(RMF_MDT_BODY);
+
+struct req_msg_field RMF_OBD_QUOTACTL =
+ DEFINE_MSGF("obd_quotactl", 0,
+ sizeof(struct obd_quotactl),
+ lustre_swab_obd_quotactl, NULL);
+EXPORT_SYMBOL(RMF_OBD_QUOTACTL);
+
+struct req_msg_field RMF_QUOTA_BODY =
+ DEFINE_MSGF("quota_body", 0,
+ sizeof(struct quota_body), lustre_swab_quota_body, NULL);
+EXPORT_SYMBOL(RMF_QUOTA_BODY);
+
+struct req_msg_field RMF_MDT_EPOCH =
+ DEFINE_MSGF("mdt_ioepoch", 0,
+ sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL);
+EXPORT_SYMBOL(RMF_MDT_EPOCH);
+
+struct req_msg_field RMF_PTLRPC_BODY =
+ DEFINE_MSGF("ptlrpc_body", 0,
+ sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL);
+EXPORT_SYMBOL(RMF_PTLRPC_BODY);
+
+struct req_msg_field RMF_CLOSE_DATA =
+ DEFINE_MSGF("data_version", 0,
+ sizeof(struct close_data), lustre_swab_close_data, NULL);
+EXPORT_SYMBOL(RMF_CLOSE_DATA);
+
+struct req_msg_field RMF_OBD_STATFS =
+ DEFINE_MSGF("obd_statfs", 0,
+ sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL);
+EXPORT_SYMBOL(RMF_OBD_STATFS);
+
+struct req_msg_field RMF_SETINFO_KEY =
+ DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_KEY);
+
+struct req_msg_field RMF_NAME =
+ DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_NAME);
+
+struct req_msg_field RMF_SYMTGT =
+ DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SYMTGT);
+
+struct req_msg_field RMF_TGTUUID =
+ DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+ NULL);
+EXPORT_SYMBOL(RMF_TGTUUID);
+
+struct req_msg_field RMF_CLUUID =
+ DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+ NULL);
+EXPORT_SYMBOL(RMF_CLUUID);
+
+struct req_msg_field RMF_STRING =
+ DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_STRING);
+
+struct req_msg_field RMF_LLOGD_BODY =
+ DEFINE_MSGF("llogd_body", 0,
+ sizeof(struct llogd_body), lustre_swab_llogd_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_BODY);
+
+struct req_msg_field RMF_LLOG_LOG_HDR =
+ DEFINE_MSGF("llog_log_hdr", 0,
+ sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL);
+EXPORT_SYMBOL(RMF_LLOG_LOG_HDR);
+
+struct req_msg_field RMF_LLOGD_CONN_BODY =
+ DEFINE_MSGF("llogd_conn_body", 0,
+ sizeof(struct llogd_conn_body),
+ lustre_swab_llogd_conn_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY);
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ *
+ * No swabbing needed because struct lustre_handle contains only a 64-bit cookie
+ * that the client does not interpret at all.
+ */
+struct req_msg_field RMF_CONN =
+ DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL);
+EXPORT_SYMBOL(RMF_CONN);
+
+struct req_msg_field RMF_CONNECT_DATA =
+ DEFINE_MSGF("cdata",
+ RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */,
+ sizeof(struct obd_connect_data),
+ lustre_swab_connect, NULL);
+EXPORT_SYMBOL(RMF_CONNECT_DATA);
+
+struct req_msg_field RMF_DLM_REQ =
+ DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */,
+ sizeof(struct ldlm_request),
+ lustre_swab_ldlm_request, NULL);
+EXPORT_SYMBOL(RMF_DLM_REQ);
+
+struct req_msg_field RMF_DLM_REP =
+ DEFINE_MSGF("dlm_rep", 0,
+ sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL);
+EXPORT_SYMBOL(RMF_DLM_REP);
+
+struct req_msg_field RMF_LDLM_INTENT =
+ DEFINE_MSGF("ldlm_intent", 0,
+ sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL);
+EXPORT_SYMBOL(RMF_LDLM_INTENT);
+
+struct req_msg_field RMF_DLM_LVB =
+ DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_DLM_LVB);
+
+struct req_msg_field RMF_DLM_GL_DESC =
+ DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc),
+ lustre_swab_gl_desc, NULL);
+EXPORT_SYMBOL(RMF_DLM_GL_DESC);
+
+struct req_msg_field RMF_MDT_MD =
+ DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_MDT_MD);
+
+struct req_msg_field RMF_REC_REINT =
+ DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint),
+ lustre_swab_mdt_rec_reint, NULL);
+EXPORT_SYMBOL(RMF_REC_REINT);
+
+/* FIXME: this length should be defined as a macro */
+struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1,
+ NULL, NULL);
+EXPORT_SYMBOL(RMF_EADATA);
+
+struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_EAVALS);
+
+struct req_msg_field RMF_ACL =
+ DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK,
+ LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_ACL);
+
+/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */
+struct req_msg_field RMF_LOGCOOKIES =
+ DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */,
+ sizeof(struct llog_cookie), NULL, NULL);
+EXPORT_SYMBOL(RMF_LOGCOOKIES);
+
+struct req_msg_field RMF_CAPA1 =
+ DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+ lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA1);
+
+struct req_msg_field RMF_CAPA2 =
+ DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+ lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA2);
+
+struct req_msg_field RMF_LAYOUT_INTENT =
+ DEFINE_MSGF("layout_intent", 0,
+ sizeof(struct layout_intent), lustre_swab_layout_intent,
+ NULL);
+EXPORT_SYMBOL(RMF_LAYOUT_INTENT);
+
+/*
+ * OST request field.
+ */
+struct req_msg_field RMF_OST_BODY =
+ DEFINE_MSGF("ost_body", 0,
+ sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body);
+EXPORT_SYMBOL(RMF_OST_BODY);
+
+struct req_msg_field RMF_OBD_IOOBJ =
+ DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY,
+ sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo);
+EXPORT_SYMBOL(RMF_OBD_IOOBJ);
+
+struct req_msg_field RMF_NIOBUF_REMOTE =
+ DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY,
+ sizeof(struct niobuf_remote), lustre_swab_niobuf_remote,
+ dump_rniobuf);
+EXPORT_SYMBOL(RMF_NIOBUF_REMOTE);
+
+struct req_msg_field RMF_RCS =
+ DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32),
+ lustre_swab_generic_32s, dump_rcs);
+EXPORT_SYMBOL(RMF_RCS);
+
+struct req_msg_field RMF_EAVALS_LENS =
+ DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32),
+ lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_EAVALS_LENS);
+
+struct req_msg_field RMF_OBD_ID =
+ DEFINE_MSGF("u64", 0,
+ sizeof(u64), lustre_swab_ost_last_id, NULL);
+EXPORT_SYMBOL(RMF_OBD_ID);
+
+struct req_msg_field RMF_FID =
+ DEFINE_MSGF("fid", 0,
+ sizeof(struct lu_fid), lustre_swab_lu_fid, NULL);
+EXPORT_SYMBOL(RMF_FID);
+
+struct req_msg_field RMF_OST_ID =
+ DEFINE_MSGF("ost_id", 0,
+ sizeof(struct ost_id), lustre_swab_ost_id, NULL);
+EXPORT_SYMBOL(RMF_OST_ID);
+
+struct req_msg_field RMF_FIEMAP_KEY =
+ DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key),
+ lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_KEY);
+
+struct req_msg_field RMF_FIEMAP_VAL =
+ DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_VAL);
+
+struct req_msg_field RMF_IDX_INFO =
+ DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info),
+ lustre_swab_idx_info, NULL);
+EXPORT_SYMBOL(RMF_IDX_INFO);
+struct req_msg_field RMF_HSM_USER_STATE =
+ DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state),
+ lustre_swab_hsm_user_state, NULL);
+EXPORT_SYMBOL(RMF_HSM_USER_STATE);
+
+struct req_msg_field RMF_HSM_STATE_SET =
+ DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set),
+ lustre_swab_hsm_state_set, NULL);
+EXPORT_SYMBOL(RMF_HSM_STATE_SET);
+
+struct req_msg_field RMF_MDS_HSM_PROGRESS =
+ DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel),
+ lustre_swab_hsm_progress_kernel, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS);
+
+struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION =
+ DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action),
+ lustre_swab_hsm_current_action, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION);
+
+struct req_msg_field RMF_MDS_HSM_USER_ITEM =
+ DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY,
+ sizeof(struct hsm_user_item), lustre_swab_hsm_user_item,
+ NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM);
+
+struct req_msg_field RMF_MDS_HSM_ARCHIVE =
+ DEFINE_MSGF("hsm_archive", 0,
+ sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE);
+
+struct req_msg_field RMF_MDS_HSM_REQUEST =
+ DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request),
+ lustre_swab_hsm_request, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST);
+
+struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1,
+ lustre_swab_update_buf, NULL);
+EXPORT_SYMBOL(RMF_UPDATE);
+
+struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1,
+ lustre_swab_update_reply_buf,
+ NULL);
+EXPORT_SYMBOL(RMF_UPDATE_REPLY);
+
+struct req_msg_field RMF_SWAP_LAYOUTS =
+ DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts),
+ lustre_swab_swap_layouts, NULL);
+EXPORT_SYMBOL(RMF_SWAP_LAYOUTS);
+/*
+ * Request formats.
+ */
+
+struct req_format {
+ const char *rf_name;
+ int rf_idx;
+ struct {
+ int nr;
+ const struct req_msg_field **d;
+ } rf_fields[RCL_NR];
+};
+
+#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \
+ .rf_name = name, \
+ .rf_fields = { \
+ [RCL_CLIENT] = { \
+ .nr = client_nr, \
+ .d = client \
+ }, \
+ [RCL_SERVER] = { \
+ .nr = server_nr, \
+ .d = server \
+ } \
+ } \
+}
+
+#define DEFINE_REQ_FMT0(name, client, server) \
+DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server))
+
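The macro above is how every format below is built: a client and a server field array are paired and their lengths taken with ARRAY_SIZE(). A sketch of a purely hypothetical new format (annotation, not part of this patch); RQF_EXAMPLE and both arrays are invented names, and a real format must also be listed in req_formats[] above so that req_layout_init() assigns its offsets.

/* Illustrative only; hypothetical format, not registered in req_formats[]. */
static const struct req_msg_field *example_client[] = {
	&RMF_PTLRPC_BODY,
	&RMF_MDT_BODY
};

static const struct req_msg_field *example_server[] = {
	&RMF_PTLRPC_BODY,
	&RMF_MDT_BODY,
	&RMF_MDT_MD
};

struct req_format RQF_EXAMPLE =
	DEFINE_REQ_FMT0("EXAMPLE", example_client, example_server);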
+struct req_format RQF_OBD_PING =
+ DEFINE_REQ_FMT0("OBD_PING", empty, empty);
+EXPORT_SYMBOL(RQF_OBD_PING);
+
+struct req_format RQF_OBD_SET_INFO =
+ DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty);
+EXPORT_SYMBOL(RQF_OBD_SET_INFO);
+
+/* Read index file through the network */
+struct req_format RQF_OBD_IDX_READ =
+ DEFINE_REQ_FMT0("OBD_IDX_READ",
+ obd_idx_read_client, obd_idx_read_server);
+EXPORT_SYMBOL(RQF_OBD_IDX_READ);
+
+struct req_format RQF_SEC_CTX =
+ DEFINE_REQ_FMT0("SEC_CTX", empty, empty);
+EXPORT_SYMBOL(RQF_SEC_CTX);
+
+struct req_format RQF_MGS_TARGET_REG =
+ DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only,
+ mgs_target_info_only);
+EXPORT_SYMBOL(RQF_MGS_TARGET_REG);
+
+struct req_format RQF_MGS_SET_INFO =
+ DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info,
+ mgs_set_info);
+EXPORT_SYMBOL(RQF_MGS_SET_INFO);
+
+struct req_format RQF_MGS_CONFIG_READ =
+ DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client,
+ mgs_config_read_server);
+EXPORT_SYMBOL(RQF_MGS_CONFIG_READ);
+
+struct req_format RQF_SEQ_QUERY =
+ DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server);
+EXPORT_SYMBOL(RQF_SEQ_QUERY);
+
+struct req_format RQF_FLD_QUERY =
+ DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server);
+EXPORT_SYMBOL(RQF_FLD_QUERY);
+
+struct req_format RQF_LOG_CANCEL =
+ DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty);
+EXPORT_SYMBOL(RQF_LOG_CANCEL);
+
+struct req_format RQF_MDS_QUOTACHECK =
+ DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_MDS_QUOTACHECK);
+
+struct req_format RQF_OST_QUOTACHECK =
+ DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_OST_QUOTACHECK);
+
+struct req_format RQF_MDS_QUOTACTL =
+ DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+ DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QC_CALLBACK =
+ DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_QC_CALLBACK);
+
+struct req_format RQF_QUOTA_DQACQ =
+ DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+ DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+ ldlm_intent_quota_client,
+ ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+struct req_format RQF_MDS_GETSTATUS =
+ DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GETSTATUS);
+
+struct req_format RQF_MDS_STATFS =
+ DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_SYNC =
+ DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+ DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+ DEFINE_REQ_FMT0("MDS_GETXATTR",
+ mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME =
+ DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+ mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+ DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+ mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_RMT_ACL =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL",
+ mds_reint_create_rmt_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL);
+
+struct req_format RQF_MDS_REINT_CREATE_SLAVE =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+ mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+ DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+ mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+ DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+ mds_reint_open_client, mds_reint_open_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_OPEN);
+
+struct req_format RQF_MDS_REINT_UNLINK =
+ DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client,
+ mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK);
+
+struct req_format RQF_MDS_REINT_LINK =
+ DEFINE_REQ_FMT0("MDS_REINT_LINK",
+ mds_reint_link_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK);
+
+struct req_format RQF_MDS_REINT_RENAME =
+ DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client,
+ mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_RENAME);
+
+struct req_format RQF_MDS_REINT_SETATTR =
+ DEFINE_REQ_FMT0("MDS_REINT_SETATTR",
+ mds_reint_setattr_client, mds_setattr_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR);
+
+struct req_format RQF_MDS_REINT_SETXATTR =
+ DEFINE_REQ_FMT0("MDS_REINT_SETXATTR",
+ mds_reint_setxattr_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
+
+struct req_format RQF_MDS_CONNECT =
+ DEFINE_REQ_FMT0("MDS_CONNECT",
+ obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_MDS_CONNECT);
+
+struct req_format RQF_MDS_DISCONNECT =
+ DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_MDS_DISCONNECT);
+
+struct req_format RQF_MDS_GET_INFO =
+ DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client,
+ mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_GET_INFO);
+
+struct req_format RQF_UPDATE_OBJ =
+ DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client,
+ mds_update_server);
+EXPORT_SYMBOL(RQF_UPDATE_OBJ);
+
+struct req_format RQF_LDLM_ENQUEUE =
+ DEFINE_REQ_FMT0("LDLM_ENQUEUE",
+ ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE);
+
+struct req_format RQF_LDLM_ENQUEUE_LVB =
+ DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB",
+ ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB);
+
+struct req_format RQF_LDLM_CONVERT =
+ DEFINE_REQ_FMT0("LDLM_CONVERT",
+ ldlm_enqueue_client, ldlm_enqueue_server);
+EXPORT_SYMBOL(RQF_LDLM_CONVERT);
+
+struct req_format RQF_LDLM_CANCEL =
+ DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CANCEL);
+
+struct req_format RQF_LDLM_CALLBACK =
+ DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CALLBACK);
+
+struct req_format RQF_LDLM_CP_CALLBACK =
+ DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK);
+
+struct req_format RQF_LDLM_BL_CALLBACK =
+ DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_CALLBACK =
+ DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client,
+ ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_DESC_CALLBACK =
+ DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client,
+ ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK);
+
+struct req_format RQF_LDLM_INTENT_BASIC =
+ DEFINE_REQ_FMT0("LDLM_INTENT_BASIC",
+ ldlm_intent_basic_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC);
+
+struct req_format RQF_LDLM_INTENT =
+ DEFINE_REQ_FMT0("LDLM_INTENT",
+ ldlm_intent_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT);
+
+struct req_format RQF_LDLM_INTENT_LAYOUT =
+ DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ",
+ ldlm_intent_layout_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT);
+
+struct req_format RQF_LDLM_INTENT_GETATTR =
+ DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR",
+ ldlm_intent_getattr_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR);
+
+struct req_format RQF_LDLM_INTENT_OPEN =
+ DEFINE_REQ_FMT0("LDLM_INTENT_OPEN",
+ ldlm_intent_open_client, ldlm_intent_open_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN);
+
+struct req_format RQF_LDLM_INTENT_CREATE =
+ DEFINE_REQ_FMT0("LDLM_INTENT_CREATE",
+ ldlm_intent_create_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE);
+
+struct req_format RQF_LDLM_INTENT_UNLINK =
+ DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK",
+ ldlm_intent_unlink_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK);
+
+struct req_format RQF_LDLM_INTENT_GETXATTR =
+ DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR",
+ ldlm_intent_getxattr_client,
+ ldlm_intent_getxattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR);
+
+struct req_format RQF_MDS_CLOSE =
+ DEFINE_REQ_FMT0("MDS_CLOSE",
+ mdt_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_CLOSE);
+
+struct req_format RQF_MDS_RELEASE_CLOSE =
+ DEFINE_REQ_FMT0("MDS_CLOSE",
+ mdt_release_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE);
+
+struct req_format RQF_MDS_PIN =
+ DEFINE_REQ_FMT0("MDS_PIN",
+ mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_PIN);
+
+struct req_format RQF_MDS_UNPIN =
+ DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty);
+EXPORT_SYMBOL(RQF_MDS_UNPIN);
+
+struct req_format RQF_MDS_DONE_WRITING =
+ DEFINE_REQ_FMT0("MDS_DONE_WRITING",
+ mdt_close_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_DONE_WRITING);
+
+struct req_format RQF_MDS_READPAGE =
+ DEFINE_REQ_FMT0("MDS_READPAGE",
+ mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_READPAGE);
+
+struct req_format RQF_MDS_HSM_ACTION =
+ DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_ACTION);
+
+struct req_format RQF_MDS_HSM_PROGRESS =
+ DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS);
+
+struct req_format RQF_MDS_HSM_CT_REGISTER =
+ DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER);
+
+struct req_format RQF_MDS_HSM_CT_UNREGISTER =
+ DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER);
+
+struct req_format RQF_MDS_HSM_STATE_GET =
+ DEFINE_REQ_FMT0("MDS_HSM_STATE_GET",
+ mdt_body_capa, mdt_hsm_state_get_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET);
+
+struct req_format RQF_MDS_HSM_STATE_SET =
+ DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET);
+
+struct req_format RQF_MDS_HSM_REQUEST =
+ DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST);
+
+struct req_format RQF_MDS_SWAP_LAYOUTS =
+ DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS",
+ mdt_swap_layouts, empty);
+EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS);
+
+/* This is for split */
+struct req_format RQF_MDS_WRITEPAGE =
+ DEFINE_REQ_FMT0("MDS_WRITEPAGE",
+ mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_WRITEPAGE);
+
+struct req_format RQF_MDS_IS_SUBDIR =
+ DEFINE_REQ_FMT0("MDS_IS_SUBDIR",
+ mdt_body_only, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE =
+ DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE",
+ llog_origin_handle_create_client, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY =
+ DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY",
+ llogd_body_only, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK =
+ DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK",
+ llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK =
+ DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK",
+ llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER =
+ DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER",
+ llogd_body_only, llog_log_hdr_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+
+struct req_format RQF_LLOG_ORIGIN_CONNECT =
+ DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT);
+
+struct req_format RQF_CONNECT =
+ DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_CONNECT);
+
+struct req_format RQF_OST_CONNECT =
+ DEFINE_REQ_FMT0("OST_CONNECT",
+ obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_OST_CONNECT);
+
+struct req_format RQF_OST_DISCONNECT =
+ DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_OST_DISCONNECT);
+
+struct req_format RQF_OST_GETATTR =
+ DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_GETATTR);
+
+struct req_format RQF_OST_SETATTR =
+ DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SETATTR);
+
+struct req_format RQF_OST_CREATE =
+ DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_CREATE);
+
+struct req_format RQF_OST_PUNCH =
+ DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_PUNCH);
+
+struct req_format RQF_OST_SYNC =
+ DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SYNC);
+
+struct req_format RQF_OST_DESTROY =
+ DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_DESTROY);
+
+struct req_format RQF_OST_BRW_READ =
+ DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server);
+EXPORT_SYMBOL(RQF_OST_BRW_READ);
+
+struct req_format RQF_OST_BRW_WRITE =
+ DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server);
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE);
+
+struct req_format RQF_OST_STATFS =
+ DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_OST_STATFS);
+
+struct req_format RQF_OST_SET_GRANT_INFO =
+ DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client,
+ ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO);
+
+struct req_format RQF_OST_GET_INFO_GENERIC =
+ DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client,
+ ost_get_info_generic_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC);
+
+struct req_format RQF_OST_GET_INFO_LAST_ID =
+ DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client,
+ ost_get_last_id_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID);
+
+struct req_format RQF_OST_GET_INFO_LAST_FID =
+ DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client,
+ ost_get_last_fid_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID);
+
+struct req_format RQF_OST_SET_INFO_LAST_FID =
+ DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client,
+ empty);
+EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID);
+
+struct req_format RQF_OST_GET_INFO_FIEMAP =
+ DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client,
+ ost_get_fiemap_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP);
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* Convenience macro */
+#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)]
+
+/**
+ * Initializes the capsule abstraction by computing and setting the \a rf_idx
+ * field of RQFs and the \a rmf_offset field of RMFs.
+ */
+int req_layout_init(void)
+{
+ int i;
+ int j;
+ int k;
+ struct req_format *rf = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(req_formats); ++i) {
+ rf = req_formats[i];
+ rf->rf_idx = i;
+ for (j = 0; j < RCL_NR; ++j) {
+ LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR);
+ for (k = 0; k < rf->rf_fields[j].nr; ++k) {
+ struct req_msg_field *field;
+
+ field = (typeof(field))rf->rf_fields[j].d[k];
+ LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY)
+ || field->rmf_size > 0);
+ LASSERT(field->rmf_offset[i][j] == 0);
+ /*
+ * k + 1 to detect unused format/field
+ * combinations.
+ */
+ field->rmf_offset[i][j] = k + 1;
+ }
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(req_layout_init);
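Because req_layout_init() stores k + 1, an offset of 0 doubles as "this field is not part of that format". A hypothetical membership test built on that convention (annotation, not part of this patch):

/* Illustrative only; example_field_in_format() is a hypothetical helper. */
static inline int example_field_in_format(const struct req_format *fmt,
					  const struct req_msg_field *field,
					  enum req_location loc)
{
	/* rmf_offset holds position-in-message + 1, or 0 if unused */
	return field->rmf_offset[fmt->rf_idx][loc] != 0;
}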
+
+void req_layout_fini(void)
+{
+}
+EXPORT_SYMBOL(req_layout_fini);
+
+/**
+ * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1.
+ *
+ * Actual/expected field sizes are set elsewhere in functions in this file:
+ * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and
+ * req_capsule_msg_size(). The \a rc_area information is used by
+ * ptlrpc_request_set_replen().
+ */
+void req_capsule_init_area(struct req_capsule *pill)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) {
+ pill->rc_area[RCL_CLIENT][i] = -1;
+ pill->rc_area[RCL_SERVER][i] = -1;
+ }
+}
+EXPORT_SYMBOL(req_capsule_init_area);
+
+/**
+ * Initialize a pill.
+ *
+ * The \a location indicates whether the caller is executing on the client side
+ * (RCL_CLIENT) or server side (RCL_SERVER).
+ */
+void req_capsule_init(struct req_capsule *pill,
+ struct ptlrpc_request *req,
+ enum req_location location)
+{
+ LASSERT(location == RCL_SERVER || location == RCL_CLIENT);
+
+ /*
+ * Today all capsules are embedded in ptlrpc_request structs,
+ * but just in case that ever isn't the case, we don't reach
+ * into req unless req != NULL and pill is the one embedded in
+ * the req.
+ *
+ * The req->rq_pill_init flag makes it safe to initialize a pill
+ * twice, which might happen in the OST paths as a result of the
+ * high-priority RPC queue getting peeked at before ost_handle()
+ * handles an OST RPC.
+ */
+ if (req != NULL && pill == &req->rq_pill && req->rq_pill_init)
+ return;
+
+ memset(pill, 0, sizeof(*pill));
+ pill->rc_req = req;
+ pill->rc_loc = location;
+ req_capsule_init_area(pill);
+
+ if (req != NULL && pill == &req->rq_pill)
+ req->rq_pill_init = 1;
+}
+EXPORT_SYMBOL(req_capsule_init);
+
+void req_capsule_fini(struct req_capsule *pill)
+{
+}
+EXPORT_SYMBOL(req_capsule_fini);
+
+static int __req_format_is_sane(const struct req_format *fmt)
+{
+ return
+ 0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) &&
+ req_formats[fmt->rf_idx] == fmt;
+}
+
+static struct lustre_msg *__req_msg(const struct req_capsule *pill,
+ enum req_location loc)
+{
+ struct ptlrpc_request *req;
+
+ req = pill->rc_req;
+ return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg;
+}
+
+/**
+ * Set the format (\a fmt) of a \a pill; format changes are not allowed here
+ * (see req_capsule_extend()).
+ */
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt)
+{
+ LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt);
+ LASSERT(__req_format_is_sane(fmt));
+
+ pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_set);
+
+/**
+ * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in
+ * yet.
+ *
+ * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of
+ * variable-sized fields. The field sizes come from the declared \a rmf_size
+ * field of a \a pill's \a rc_fmt's RMF's.
+ */
+int req_capsule_filled_sizes(struct req_capsule *pill,
+ enum req_location loc)
+{
+ const struct req_format *fmt = pill->rc_fmt;
+ int i;
+
+ LASSERT(fmt != NULL);
+
+ for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+ if (pill->rc_area[loc][i] == -1) {
+ pill->rc_area[loc][i] =
+ fmt->rf_fields[loc].d[i]->rmf_size;
+ if (pill->rc_area[loc][i] == -1) {
+ /*
+ * Skip the following fields.
+ *
+ * If this LASSERT() trips then you're missing a
+ * call to req_capsule_set_size().
+ */
+ LASSERT(loc != RCL_SERVER);
+ break;
+ }
+ }
+ }
+ return i;
+}
+EXPORT_SYMBOL(req_capsule_filled_sizes);
+
+/**
+ * Capsule equivalent of lustre_pack_request() and lustre_pack_reply().
+ *
+ * This function uses the \a pill's \a rc_area as filled in by
+ * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by
+ * this function).
+ */
+int req_capsule_server_pack(struct req_capsule *pill)
+{
+ const struct req_format *fmt;
+ int count;
+ int rc;
+
+ LASSERT(pill->rc_loc == RCL_SERVER);
+ fmt = pill->rc_fmt;
+ LASSERT(fmt != NULL);
+
+ count = req_capsule_filled_sizes(pill, RCL_SERVER);
+ rc = lustre_pack_reply(pill->rc_req, count,
+ pill->rc_area[RCL_SERVER], NULL);
+ if (rc != 0) {
+ DEBUG_REQ(D_ERROR, pill->rc_req,
+ "Cannot pack %d fields in format `%s': ",
+ count, fmt->rf_name);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(req_capsule_server_pack);
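A hedged server-side sketch (annotation, not part of this patch): before packing a reply, any variable-sized reply field (rmf_size == -1) needs an explicit size via req_capsule_set_size(), otherwise the LASSERT in req_capsule_filled_sizes() fires. The field choice and the ea_len length below are only examples.

/* Illustrative only; ea_len is a hypothetical length. */
static int example_pack_getxattr_reply(struct req_capsule *pill, int ea_len)
{
	/* RMF_EADATA is variable-sized, so announce its reply size first */
	req_capsule_set_size(pill, &RMF_EADATA, RCL_SERVER, ea_len);

	return req_capsule_server_pack(pill);
}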
+
+/**
+ * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill
+ * corresponding to the given RMF (\a field).
+ */
+static int __req_capsule_offset(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ int offset;
+
+ offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+ LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n",
+ pill->rc_fmt->rf_name,
+ field->rmf_name, offset, loc);
+ offset--;
+
+ LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR);
+ return offset;
+}
+
+/**
+ * Helper for __req_capsule_get(); swabs value / array of values and/or dumps
+ * them if desired.
+ */
+static
+void
+swabber_dumper_helper(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc,
+ int offset,
+ void *value, int len, int dump, void (*swabber)(void *))
+{
+ void *p;
+ int i;
+ int n;
+ int do_swab;
+ int inout = loc == RCL_CLIENT;
+
+ swabber = swabber ?: field->rmf_swabber;
+
+ if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) &&
+ swabber != NULL && value != NULL)
+ do_swab = 1;
+ else
+ do_swab = 0;
+
+ if (!field->rmf_dumper)
+ dump = 0;
+
+ if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) {
+ if (dump) {
+ CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n",
+ do_swab ? "unswabbed " : "", field->rmf_name);
+ field->rmf_dumper(value);
+ }
+ if (!do_swab)
+ return;
+ swabber(value);
+ ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+ if (dump) {
+ CDEBUG(D_RPCTRACE, "Dump of swabbed field %s follows\n",
+ field->rmf_name);
+ field->rmf_dumper(value);
+ }
+
+ return;
+ }
+
+ /*
+ * We're swabbing an array; swabber() swabs a single array element, so
+ * swab every element.
+ */
+ LASSERT((len % field->rmf_size) == 0);
+ for (p = value, i = 0, n = len / field->rmf_size;
+ i < n;
+ i++, p += field->rmf_size) {
+ if (dump) {
+ CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, element %d follows\n",
+ do_swab ? "unswabbed " : "", field->rmf_name, i);
+ field->rmf_dumper(p);
+ }
+ if (!do_swab)
+ continue;
+ swabber(p);
+ if (dump) {
+ CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, element %d follows\n",
+ field->rmf_name, i);
+			field->rmf_dumper(p);
+ }
+ }
+ if (do_swab)
+ ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+}
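For RMF_F_STRUCT_ARRAY fields such as RMF_NIOBUF_REMOTE, the buffer is a packed array, so the total length must be a whole multiple of rmf_size. A hypothetical element-count helper (annotation, not part of this patch) makes that invariant explicit:

/* Illustrative only; returns -1 for a malformed (non-multiple) length. */
static inline int example_array_nelems(const struct req_msg_field *field,
				       int buflen)
{
	if (field->rmf_size <= 0 || (buflen % field->rmf_size) != 0)
		return -1;
	return buflen / field->rmf_size;
}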
+
+/**
+ * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill
+ * corresponding to the given RMF (\a field).
+ *
+ * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL
+ * then the \a rmf_swabber from the RMF will be used. Soon there will be no
+ * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then
+ * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each
+ * element of the array swabbed.
+ */
+static void *__req_capsule_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc,
+ void (*swabber)(void *),
+ int dump)
+{
+ const struct req_format *fmt;
+ struct lustre_msg *msg;
+ void *value;
+ int len;
+ int offset;
+
+ void *(*getter)(struct lustre_msg *m, int n, int minlen);
+
+ static const char *rcl_names[RCL_NR] = {
+ [RCL_CLIENT] = "client",
+ [RCL_SERVER] = "server"
+ };
+
+ LASSERT(pill != NULL);
+ LASSERT(pill != LP_POISON);
+ fmt = pill->rc_fmt;
+ LASSERT(fmt != NULL);
+ LASSERT(fmt != LP_POISON);
+ LASSERT(__req_format_is_sane(fmt));
+
+ offset = __req_capsule_offset(pill, field, loc);
+
+ msg = __req_msg(pill, loc);
+ LASSERT(msg != NULL);
+
+ getter = (field->rmf_flags & RMF_F_STRING) ?
+ (typeof(getter))lustre_msg_string : lustre_msg_buf;
+
+ if (field->rmf_flags & RMF_F_STRUCT_ARRAY) {
+ /*
+ * We've already asserted that field->rmf_size > 0 in
+ * req_layout_init().
+ */
+ len = lustre_msg_buflen(msg, offset);
+ if ((len % field->rmf_size) != 0) {
+ CERROR("%s: array field size mismatch %d modulo %d != 0 (%d)\n",
+ field->rmf_name, len, field->rmf_size, loc);
+ return NULL;
+ }
+ } else if (pill->rc_area[loc][offset] != -1) {
+ len = pill->rc_area[loc][offset];
+ } else {
+ len = max(field->rmf_size, 0);
+ }
+ value = getter(msg, offset, len);
+
+ if (value == NULL) {
+ DEBUG_REQ(D_ERROR, pill->rc_req,
+ "Wrong buffer for field `%s' (%d of %d) in format `%s': %d vs. %d (%s)\n",
+ field->rmf_name, offset, lustre_msg_bufcount(msg),
+ fmt->rf_name, lustre_msg_buflen(msg, offset), len,
+ rcl_names[loc]);
+ } else {
+ swabber_dumper_helper(pill, field, loc, offset, value, len,
+ dump, swabber);
+ }
+
+ return value;
+}
+
+/**
+ * Dump a request and/or reply
+ */
+static void __req_capsule_dump(struct req_capsule *pill, enum req_location loc)
+{
+ const struct req_format *fmt;
+ const struct req_msg_field *field;
+ int len;
+ int i;
+
+ fmt = pill->rc_fmt;
+
+ DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n");
+ for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+ field = FMT_FIELD(fmt, loc, i);
+ if (field->rmf_dumper == NULL) {
+ /*
+ * FIXME Add a default hex dumper for fields that don't
+ * have a specific dumper
+ */
+ len = req_capsule_get_size(pill, field, loc);
+ CDEBUG(D_RPCTRACE, "Field %s has no dumper function; field size is %d\n",
+ field->rmf_name, len);
+ } else {
+ /* It's the dumping side-effect that we're interested in */
+ (void) __req_capsule_get(pill, field, loc, NULL, 1);
+ }
+ }
+ CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n");
+}
+
+/**
+ * Dump a request.
+ */
+void req_capsule_client_dump(struct req_capsule *pill)
+{
+ __req_capsule_dump(pill, RCL_CLIENT);
+}
+EXPORT_SYMBOL(req_capsule_client_dump);
+
+/**
+ * Dump a reply
+ */
+void req_capsule_server_dump(struct req_capsule *pill)
+{
+ __req_capsule_dump(pill, RCL_SERVER);
+}
+EXPORT_SYMBOL(req_capsule_server_dump);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_client_get(struct req_capsule *pill,
+ const struct req_msg_field *field)
+{
+ return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_get);
+
+/**
+ * Same as req_capsule_client_get(), but with a \a swabber argument.
+ *
+ * Currently unused; will be removed when req_capsule_server_swab_get() is
+ * unused too.
+ */
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber)
+{
+ return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_client_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len. Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len)
+{
+ req_capsule_set_size(pill, field, RCL_CLIENT, len);
+ return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_sized_get);
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_server_get(struct req_capsule *pill,
+ const struct req_msg_field *field)
+{
+ return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_get);
+
+/**
+ * Same as req_capsule_server_get(), but with a \a swabber argument.
+ *
+ * Ideally all swabbing should be done pursuant to RMF definitions, with no
+ * swabbing done outside this capsule abstraction.
+ */
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ void *swabber)
+{
+ return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_server_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len. Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len)
+{
+ req_capsule_set_size(pill, field, RCL_SERVER, len);
+ return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_get);
+
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ int len, void *swabber)
+{
+ req_capsule_set_size(pill, field, RCL_SERVER, len);
+ return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_swab_get);
+
+/**
+ * Returns the buffer of a \a pill corresponding to the given \a field from the
+ * request (if the caller is executing on the server-side) or reply (if the
+ * caller is executing on the client-side).
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+ const struct req_msg_field *field)
+{
+ return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc, int size)
+{
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+ if ((size != field->rmf_size) &&
+ (field->rmf_size != -1) &&
+ !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+ (size > 0)) {
+ if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+ (size % field->rmf_size != 0)) {
+ CERROR("%s: array field size mismatch %d %% %d != 0 (%d)\n",
+ field->rmf_name, size, field->rmf_size, loc);
+ LBUG();
+ } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+ size < field->rmf_size) {
+ CERROR("%s: field size mismatch %d != %d (%d)\n",
+ field->rmf_name, size, field->rmf_size, loc);
+ LBUG();
+ }
+ }
+
+ pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
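+
+/*
+ * Illustrative sketch: a sender of a variable-length string field sets its
+ * size before packing and fills the buffer once the request has been packed
+ * (see llog_client_open() in llog_client.c, added below, for a real caller):
+ *
+ *   req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ *                        strlen(name) + 1);
+ *   rc = ptlrpc_request_pack(req, version, opcode);
+ *   ...
+ *   tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+ *                                      strlen(name) + 1);
+ *   strcpy(tmp, name);
+ */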
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function does not mirror req_capsule_set_size(), which sets the
+ * size in pill.rc_area[loc][offset]; this function returns the message
+ * buflen[offset] instead, so perhaps it deserves a different name.
+ */
+int req_capsule_get_size(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+ return lustre_msg_buflen(__req_msg(pill, loc),
+ __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+ return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+ pill->rc_fmt->rf_fields[loc].nr,
+ pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+ enum req_location loc)
+{
+ int size, i = 0;
+
+ /*
+ * This function should probably LASSERT() that fmt has no fields with
+ * RMF_F_STRUCT_ARRAY in rmf_flags, since we cannot know here how many
+ * elements the array will ultimately have; instead we assume there
+ * will be at least one element, and that is exactly what we do.
+ */
+ size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr);
+ if (size < 0)
+ return size;
+
+ for (; i < fmt->rf_fields[loc].nr; ++i)
+ if (fmt->rf_fields[loc].d[i]->rmf_size != -1)
+ size += cfs_size_round(fmt->rf_fields[loc].d[i]->
+ rmf_size);
+ return size;
+}
+
+/**
+ * Changes the format of an RPC.
+ *
+ * The pill must already have been initialized, which means that it already has
+ * a request format. The new format \a fmt must be an extension of the pill's
+ * old format. Specifically: the new format must have at least as many request
+ * and reply fields as the old one, and every field shared by both formats must
+ * be at least as large in the new format.
+ *
+ * The new format's fields may be of different "type" than the old format, but
+ * only for fields that are "opaque" blobs: fields which a) have no
+ * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a
+ * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example,
+ * OBD_SET_INFO has a key field and an opaque value field that gets interpreted
+ * according to the key field. When the value, according to the key, contains a
+ * structure (or array thereof) to be swabbed, the format should be changed to
+ * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set
+ * accordingly.
+ */
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt)
+{
+ int i;
+ int j;
+
+ const struct req_format *old;
+
+ LASSERT(pill->rc_fmt != NULL);
+ LASSERT(__req_format_is_sane(fmt));
+
+ old = pill->rc_fmt;
+ /*
+ * Sanity checking...
+ */
+ for (i = 0; i < RCL_NR; ++i) {
+ LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr);
+ for (j = 0; j < old->rf_fields[i].nr - 1; ++j) {
+ const struct req_msg_field *ofield = FMT_FIELD(old, i, j);
+
+ /* "opaque" fields can be transmogrified */
+ if (ofield->rmf_swabber == NULL &&
+ (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 &&
+ (ofield->rmf_size == -1 ||
+ ofield->rmf_flags == RMF_F_NO_SIZE_CHECK))
+ continue;
+ LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j));
+ }
+ /*
+ * Last field in old format can be shorter than in new.
+ */
+ LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >=
+ FMT_FIELD(old, i, j)->rmf_size);
+ }
+
+ pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_extend);
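+
+/*
+ * Illustrative sketch, not a real caller: once a handler has inspected the
+ * key of an OBD_SET_INFO-style request it can switch the pill to a more
+ * specific format so that later req_capsule_*_get() calls apply the proper
+ * swabber and size checks (the RQF_/RMF_ names below are hypothetical):
+ *
+ *   if (key_says_value_is_structured)
+ *           req_capsule_extend(&req->rq_pill, &RQF_SET_INFO_STRUCTURED);
+ *   val = req_capsule_client_get(&req->rq_pill, &RMF_SET_INFO_VAL);
+ */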
+
+/**
+ * This function returns a non-zero value if the given \a field is present in
+ * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it
+ * returns 0.
+ */
+int req_capsule_has_field(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+ return field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+}
+EXPORT_SYMBOL(req_capsule_has_field);
+
+/**
+ * Returns a non-zero value if the given \a field is present in the given \a
+ * pill's PTLRPC request or reply (\a loc), else it returns 0.
+ */
+int req_capsule_field_present(const struct req_capsule *pill,
+ const struct req_msg_field *field,
+ enum req_location loc)
+{
+ int offset;
+
+ LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+ LASSERT(req_capsule_has_field(pill, field, loc));
+
+ offset = __req_capsule_offset(pill, field, loc);
+ return lustre_msg_bufcount(__req_msg(pill, loc)) > offset;
+}
+EXPORT_SYMBOL(req_capsule_field_present);
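+
+/*
+ * A minimal sketch of the usual guard for optional fields (RMF_SOMETHING is
+ * a placeholder, not a real field):
+ *
+ *   if (req_capsule_field_present(pill, &RMF_SOMETHING, RCL_CLIENT))
+ *           ptr = req_capsule_client_get(pill, &RMF_SOMETHING);
+ */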
+
+/**
+ * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC
+ * request or reply (\a loc).
+ *
+ * This is not the opposite of req_capsule_extend().
+ */
+void req_capsule_shrink(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ unsigned int newlen,
+ enum req_location loc)
+{
+ const struct req_format *fmt;
+ struct lustre_msg *msg;
+ int len;
+ int offset;
+
+ fmt = pill->rc_fmt;
+ LASSERT(fmt != NULL);
+ LASSERT(__req_format_is_sane(fmt));
+ LASSERT(req_capsule_has_field(pill, field, loc));
+ LASSERT(req_capsule_field_present(pill, field, loc));
+
+ offset = __req_capsule_offset(pill, field, loc);
+
+ msg = __req_msg(pill, loc);
+ len = lustre_msg_buflen(msg, offset);
+ LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n",
+ fmt->rf_name, field->rmf_name, len, newlen);
+
+ if (loc == RCL_CLIENT)
+ pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen,
+ 1);
+ else
+ pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen,
+ 1);
+}
+EXPORT_SYMBOL(req_capsule_shrink);
+
+int req_capsule_server_grow(struct req_capsule *pill,
+ const struct req_msg_field *field,
+ unsigned int newlen)
+{
+ struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs;
+ char *from, *to;
+ int offset, len, rc;
+
+ LASSERT(pill->rc_fmt != NULL);
+ LASSERT(__req_format_is_sane(pill->rc_fmt));
+ LASSERT(req_capsule_has_field(pill, field, RCL_SERVER));
+ LASSERT(req_capsule_field_present(pill, field, RCL_SERVER));
+
+ len = req_capsule_get_size(pill, field, RCL_SERVER);
+ offset = __req_capsule_offset(pill, field, RCL_SERVER);
+ if (pill->rc_req->rq_repbuf_len >=
+ lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen)
+ CERROR("Inplace repack might be done\n");
+
+ pill->rc_req->rq_reply_state = NULL;
+ req_capsule_set_size(pill, field, RCL_SERVER, newlen);
+ rc = req_capsule_server_pack(pill);
+ if (rc) {
+ /* put old rs back, the caller will decide what to do */
+ pill->rc_req->rq_reply_state = rs;
+ return rc;
+ }
+ nrs = pill->rc_req->rq_reply_state;
+ /* Now we need only buffers, copy first chunk */
+ to = lustre_msg_buf(nrs->rs_msg, 0, 0);
+ from = lustre_msg_buf(rs->rs_msg, 0, 0);
+ len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from;
+ memcpy(to, from, len);
+ /* check if we have tail and copy it too */
+ if (rs->rs_msg->lm_bufcount > offset + 1) {
+ to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0);
+ from = lustre_msg_buf(rs->rs_msg, offset + 1, 0);
+ offset = rs->rs_msg->lm_bufcount - 1;
+ len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) +
+ cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from;
+ memcpy(to, from, len);
+ }
+ /* drop old reply if everything is fine */
+ if (rs->rs_difficult) {
+ /* copy rs data */
+ int i;
+
+ nrs->rs_difficult = 1;
+ nrs->rs_no_ack = rs->rs_no_ack;
+ for (i = 0; i < rs->rs_nlocks; i++) {
+ nrs->rs_locks[i] = rs->rs_locks[i];
+ nrs->rs_modes[i] = rs->rs_modes[i];
+ nrs->rs_nlocks++;
+ }
+ rs->rs_nlocks = 0;
+ rs->rs_difficult = 0;
+ rs->rs_no_ack = 0;
+ }
+ ptlrpc_rs_decref(rs);
+ return 0;
+}
+EXPORT_SYMBOL(req_capsule_server_grow);
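+
+/*
+ * Illustrative sketch: a server-side handler that finds, after packing its
+ * reply, that a field needs more room than was reserved can grow it instead
+ * of repacking by hand (assuming RMF_EADATA is the field in question and
+ * new_len is the required size):
+ *
+ *   rc = req_capsule_server_grow(&req->rq_pill, &RMF_EADATA, new_len);
+ *   if (rc == 0)
+ *           buf = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+ */
+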
+#endif /* __REQ_LAYOUT_USER__ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
new file mode 100644
index 000000000..e9baf5bbe
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
@@ -0,0 +1,366 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_class.h"
+#include "../include/lustre_log.h"
+#include "../include/lustre_net.h"
+#include <linux/list.h>
+
+#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \
+ mutex_lock(&ctxt->loc_mutex); \
+ if (ctxt->loc_imp) { \
+ imp = class_import_get(ctxt->loc_imp); \
+ } else { \
+ CERROR("ctxt->loc_imp == NULL for context idx %d." \
+ "Unable to complete MDS/OSS recovery," \
+ "but I'll try again next time. Not fatal.\n", \
+ ctxt->loc_idx); \
+ imp = NULL; \
+ mutex_unlock(&ctxt->loc_mutex); \
+ return (-EINVAL); \
+ } \
+ mutex_unlock(&ctxt->loc_mutex); \
+} while (0)
+
+#define LLOG_CLIENT_EXIT(ctxt, imp) do { \
+ mutex_lock(&ctxt->loc_mutex); \
+ if (ctxt->loc_imp != imp) \
+ CWARN("loc_imp has changed from %p to %p\n", \
+ ctxt->loc_imp, imp); \
+ class_import_put(imp); \
+ mutex_unlock(&ctxt->loc_mutex); \
+} while (0)
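+
+/*
+ * Usage pattern (a minimal sketch; the real helpers below add request
+ * packing and error handling): every llog client RPC pins the context's
+ * import on entry and drops the reference on every exit path:
+ *
+ *   LLOG_CLIENT_ENTRY(ctxt, imp);  // takes a ref on ctxt->loc_imp or returns
+ *   ... allocate, pack and send the request via imp ...
+ *   LLOG_CLIENT_EXIT(ctxt, imp);   // warns if loc_imp changed, drops the ref
+ */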
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_client_open(const struct lu_env *env,
+ struct llog_handle *lgh, struct llog_logid *logid,
+ char *name, enum llog_open_param open_param)
+{
+ struct obd_import *imp;
+ struct llogd_body *body;
+ struct llog_ctxt *ctxt = lgh->lgh_ctxt;
+ struct ptlrpc_request *req = NULL;
+ int rc;
+
+ LLOG_CLIENT_ENTRY(ctxt, imp);
+
+ /* client cannot create llog */
+ LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param);
+ LASSERT(lgh);
+
+ req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (name)
+ req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ strlen(name) + 1);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_CREATE);
+ if (rc) {
+ ptlrpc_request_free(req);
+ req = NULL;
+ goto out;
+ }
+ ptlrpc_request_set_replen(req);
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ if (logid)
+ body->lgd_logid = *logid;
+ body->lgd_ctxt_idx = ctxt->loc_idx - 1;
+
+ if (name) {
+ char *tmp;
+ tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+ strlen(name) + 1);
+ LASSERT(tmp);
+ strcpy(tmp, name);
+ }
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ if (body == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ lgh->lgh_id = body->lgd_logid;
+ lgh->lgh_ctxt = ctxt;
+out:
+ LLOG_CLIENT_EXIT(ctxt, imp);
+ ptlrpc_req_finished(req);
+ return rc;
+}
+
+static int llog_client_destroy(const struct lu_env *env,
+ struct llog_handle *loghandle)
+{
+ struct obd_import *imp;
+ struct ptlrpc_request *req = NULL;
+ struct llogd_body *body;
+ int rc;
+
+ LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+ req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+ LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_DESTROY);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ body->lgd_logid = loghandle->lgh_id;
+ body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+
+ if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+ CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name,
+ body->lgd_llh_flags);
+
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+
+ ptlrpc_req_finished(req);
+err_exit:
+ LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+ return rc;
+}
+
+static int llog_client_next_block(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ int *cur_idx, int next_idx,
+ __u64 *cur_offset, void *buf, int len)
+{
+ struct obd_import *imp;
+ struct ptlrpc_request *req = NULL;
+ struct llogd_body *body;
+ void *ptr;
+ int rc;
+
+ LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+ req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+ LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ body->lgd_logid = loghandle->lgh_id;
+ body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+ body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+ body->lgd_index = next_idx;
+ body->lgd_saved_index = *cur_idx;
+ body->lgd_len = len;
+ body->lgd_cur_offset = *cur_offset;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ if (body == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ /* The log records are swabbed as they are processed */
+ ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+ if (ptr == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ *cur_idx = body->lgd_saved_index;
+ *cur_offset = body->lgd_cur_offset;
+
+ memcpy(buf, ptr, len);
+out:
+ ptlrpc_req_finished(req);
+err_exit:
+ LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+ return rc;
+}
+
+static int llog_client_prev_block(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ struct obd_import *imp;
+ struct ptlrpc_request *req = NULL;
+ struct llogd_body *body;
+ void *ptr;
+ int rc;
+
+ LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+ req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+ LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ body->lgd_logid = loghandle->lgh_id;
+ body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+ body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+ body->lgd_index = prev_idx;
+ body->lgd_len = len;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+ ptlrpc_request_set_replen(req);
+
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ if (body == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+ if (ptr == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ memcpy(buf, ptr, len);
+out:
+ ptlrpc_req_finished(req);
+err_exit:
+ LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+ return rc;
+}
+
+static int llog_client_read_header(const struct lu_env *env,
+ struct llog_handle *handle)
+{
+ struct obd_import *imp;
+ struct ptlrpc_request *req = NULL;
+ struct llogd_body *body;
+ struct llog_log_hdr *hdr;
+ struct llog_rec_hdr *llh_hdr;
+ int rc;
+
+ LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp);
+ req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+ LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_READ_HEADER);
+ if (req == NULL) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+
+ body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+ body->lgd_logid = handle->lgh_id;
+ body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1;
+ body->lgd_llh_flags = handle->lgh_hdr->llh_flags;
+
+ ptlrpc_request_set_replen(req);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ goto out;
+
+ hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+ if (hdr == NULL) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ memcpy(handle->lgh_hdr, hdr, sizeof(*hdr));
+ handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+ /* sanity checks */
+ llh_hdr = &handle->lgh_hdr->llh_hdr;
+ if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+ CERROR("bad log header magic: %#x (expecting %#x)\n",
+ llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+ rc = -EIO;
+ } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+ CERROR("incorrectly sized log header: %#x (expecting %#x)\n",
+ llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+ CERROR("you may need to re-run lconf --write_conf.\n");
+ rc = -EIO;
+ }
+out:
+ ptlrpc_req_finished(req);
+err_exit:
+ LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp);
+ return rc;
+}
+
+static int llog_client_close(const struct lu_env *env,
+ struct llog_handle *handle)
+{
+ /* This does not send LLOG_ORIGIN_HANDLE_CLOSE because the servers
+ * close the file at the end of every other LLOG_ RPC. */
+ return 0;
+}
+
+struct llog_operations llog_client_ops = {
+ .lop_next_block = llog_client_next_block,
+ .lop_prev_block = llog_client_prev_block,
+ .lop_read_header = llog_client_read_header,
+ .lop_open = llog_client_open,
+ .lop_destroy = llog_client_destroy,
+ .lop_close = llog_client_close,
+};
+EXPORT_SYMBOL(llog_client_ops);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
new file mode 100644
index 000000000..dac66f5b3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_net.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ * if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_class.h"
+#include "../include/lustre_log.h"
+#include <linux/list.h>
+
+int llog_initiator_connect(struct llog_ctxt *ctxt)
+{
+ struct obd_import *new_imp;
+
+ LASSERT(ctxt);
+ new_imp = ctxt->loc_obd->u.cli.cl_import;
+ LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp,
+ "%p - %p\n", ctxt->loc_imp, new_imp);
+ mutex_lock(&ctxt->loc_mutex);
+ if (ctxt->loc_imp != new_imp) {
+ if (ctxt->loc_imp)
+ class_import_put(ctxt->loc_imp);
+ ctxt->loc_imp = class_import_get(new_imp);
+ }
+ mutex_unlock(&ctxt->loc_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(llog_initiator_connect);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
new file mode 100644
index 000000000..9533ab976
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
@@ -0,0 +1,1366 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include "../include/obd_support.h"
+#include "../include/obd.h"
+#include "../include/lprocfs_status.h"
+#include "../include/lustre/lustre_idl.h"
+#include "../include/lustre_net.h"
+#include "../include/obd_class.h"
+#include "ptlrpc_internal.h"
+
+
+static struct ll_rpc_opcode {
+ __u32 opcode;
+ const char *opname;
+} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = {
+ { OST_REPLY, "ost_reply" },
+ { OST_GETATTR, "ost_getattr" },
+ { OST_SETATTR, "ost_setattr" },
+ { OST_READ, "ost_read" },
+ { OST_WRITE, "ost_write" },
+ { OST_CREATE, "ost_create" },
+ { OST_DESTROY, "ost_destroy" },
+ { OST_GET_INFO, "ost_get_info" },
+ { OST_CONNECT, "ost_connect" },
+ { OST_DISCONNECT, "ost_disconnect" },
+ { OST_PUNCH, "ost_punch" },
+ { OST_OPEN, "ost_open" },
+ { OST_CLOSE, "ost_close" },
+ { OST_STATFS, "ost_statfs" },
+ { 14, NULL }, /* formerly OST_SAN_READ */
+ { 15, NULL }, /* formerly OST_SAN_WRITE */
+ { OST_SYNC, "ost_sync" },
+ { OST_SET_INFO, "ost_set_info" },
+ { OST_QUOTACHECK, "ost_quotacheck" },
+ { OST_QUOTACTL, "ost_quotactl" },
+ { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" },
+ { MDS_GETATTR, "mds_getattr" },
+ { MDS_GETATTR_NAME, "mds_getattr_lock" },
+ { MDS_CLOSE, "mds_close" },
+ { MDS_REINT, "mds_reint" },
+ { MDS_READPAGE, "mds_readpage" },
+ { MDS_CONNECT, "mds_connect" },
+ { MDS_DISCONNECT, "mds_disconnect" },
+ { MDS_GETSTATUS, "mds_getstatus" },
+ { MDS_STATFS, "mds_statfs" },
+ { MDS_PIN, "mds_pin" },
+ { MDS_UNPIN, "mds_unpin" },
+ { MDS_SYNC, "mds_sync" },
+ { MDS_DONE_WRITING, "mds_done_writing" },
+ { MDS_SET_INFO, "mds_set_info" },
+ { MDS_QUOTACHECK, "mds_quotacheck" },
+ { MDS_QUOTACTL, "mds_quotactl" },
+ { MDS_GETXATTR, "mds_getxattr" },
+ { MDS_SETXATTR, "mds_setxattr" },
+ { MDS_WRITEPAGE, "mds_writepage" },
+ { MDS_IS_SUBDIR, "mds_is_subdir" },
+ { MDS_GET_INFO, "mds_get_info" },
+ { MDS_HSM_STATE_GET, "mds_hsm_state_get" },
+ { MDS_HSM_STATE_SET, "mds_hsm_state_set" },
+ { MDS_HSM_ACTION, "mds_hsm_action" },
+ { MDS_HSM_PROGRESS, "mds_hsm_progress" },
+ { MDS_HSM_REQUEST, "mds_hsm_request" },
+ { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" },
+ { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" },
+ { MDS_SWAP_LAYOUTS, "mds_swap_layouts" },
+ { LDLM_ENQUEUE, "ldlm_enqueue" },
+ { LDLM_CONVERT, "ldlm_convert" },
+ { LDLM_CANCEL, "ldlm_cancel" },
+ { LDLM_BL_CALLBACK, "ldlm_bl_callback" },
+ { LDLM_CP_CALLBACK, "ldlm_cp_callback" },
+ { LDLM_GL_CALLBACK, "ldlm_gl_callback" },
+ { LDLM_SET_INFO, "ldlm_set_info" },
+ { MGS_CONNECT, "mgs_connect" },
+ { MGS_DISCONNECT, "mgs_disconnect" },
+ { MGS_EXCEPTION, "mgs_exception" },
+ { MGS_TARGET_REG, "mgs_target_reg" },
+ { MGS_TARGET_DEL, "mgs_target_del" },
+ { MGS_SET_INFO, "mgs_set_info" },
+ { MGS_CONFIG_READ, "mgs_config_read" },
+ { OBD_PING, "obd_ping" },
+ { OBD_LOG_CANCEL, "llog_cancel" },
+ { OBD_QC_CALLBACK, "obd_quota_callback" },
+ { OBD_IDX_READ, "dt_index_read" },
+ { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" },
+ { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" },
+ { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" },
+ { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" },
+ { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" },
+ { LLOG_ORIGIN_CONNECT, "llog_origin_connect" },
+ { LLOG_CATINFO, "llog_catinfo" },
+ { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
+ { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" },
+ { QUOTA_DQACQ, "quota_acquire" },
+ { QUOTA_DQREL, "quota_release" },
+ { SEQ_QUERY, "seq_query" },
+ { SEC_CTX_INIT, "sec_ctx_init" },
+ { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" },
+ { SEC_CTX_FINI, "sec_ctx_fini" },
+ { FLD_QUERY, "fld_query" },
+ { UPDATE_OBJ, "update_obj" },
+};
+
+static struct ll_eopcode {
+ __u32 opcode;
+ const char *opname;
+} ll_eopcode_table[EXTRA_LAST_OPC] = {
+ { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+ { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" },
+ { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" },
+ { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" },
+ { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" },
+ { MDS_REINT_SETATTR, "mds_reint_setattr" },
+ { MDS_REINT_CREATE, "mds_reint_create" },
+ { MDS_REINT_LINK, "mds_reint_link" },
+ { MDS_REINT_UNLINK, "mds_reint_unlink" },
+ { MDS_REINT_RENAME, "mds_reint_rename" },
+ { MDS_REINT_OPEN, "mds_reint_open" },
+ { MDS_REINT_SETXATTR, "mds_reint_setxattr" },
+ { BRW_READ_BYTES, "read_bytes" },
+ { BRW_WRITE_BYTES, "write_bytes" },
+};
+
+const char *ll_opcode2str(__u32 opcode)
+{
+ /* When one of the assertions below fail, chances are that:
+ * 1) A new opcode was added in include/lustre/lustre_idl.h,
+ * but is missing from the table above.
+ * or 2) The opcode space was renumbered or rearranged,
+ * and the opcode_offset() function in
+ * ptlrpc_internal.h needs to be modified.
+ */
+ __u32 offset = opcode_offset(opcode);
+ LASSERTF(offset < LUSTRE_MAX_OPCODES,
+ "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+ offset, LUSTRE_MAX_OPCODES);
+ LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode,
+ "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+ offset, ll_rpc_opcode_table[offset].opcode, opcode);
+ return ll_rpc_opcode_table[offset].opname;
+}
+
+static const char *ll_eopcode2str(__u32 opcode)
+{
+ LASSERT(ll_eopcode_table[opcode].opcode == opcode);
+ return ll_eopcode_table[opcode].opname;
+}
+
+#if defined(CONFIG_PROC_FS)
+static void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
+ char *name,
+ struct proc_dir_entry **procroot_ret,
+ struct lprocfs_stats **stats_ret)
+{
+ struct proc_dir_entry *svc_procroot;
+ struct lprocfs_stats *svc_stats;
+ int i, rc;
+ unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+ LPROCFS_CNTR_STDDEV;
+
+ LASSERT(*procroot_ret == NULL);
+ LASSERT(*stats_ret == NULL);
+
+ svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,
+ 0);
+ if (svc_stats == NULL)
+ return;
+
+ if (dir) {
+ svc_procroot = lprocfs_register(dir, root, NULL, NULL);
+ if (IS_ERR(svc_procroot)) {
+ lprocfs_free_stats(&svc_stats);
+ return;
+ }
+ } else {
+ svc_procroot = root;
+ }
+
+ lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
+ svc_counter_config, "req_waittime", "usec");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+ svc_counter_config, "req_qdepth", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+ svc_counter_config, "req_active", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+ svc_counter_config, "req_timeout", "sec");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+ svc_counter_config, "reqbuf_avail", "bufs");
+ for (i = 0; i < EXTRA_LAST_OPC; i++) {
+ char *units;
+
+ switch (i) {
+ case BRW_WRITE_BYTES:
+ case BRW_READ_BYTES:
+ units = "bytes";
+ break;
+ default:
+ units = "reqs";
+ break;
+ }
+ lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
+ svc_counter_config,
+ ll_eopcode2str(i), units);
+ }
+ for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
+ __u32 opcode = ll_rpc_opcode_table[i].opcode;
+ lprocfs_counter_init(svc_stats,
+ EXTRA_MAX_OPCODES + i, svc_counter_config,
+ ll_opcode2str(opcode), "usec");
+ }
+
+ rc = lprocfs_register_stats(svc_procroot, name, svc_stats);
+ if (rc < 0) {
+ if (dir)
+ lprocfs_remove(&svc_procroot);
+ lprocfs_free_stats(&svc_stats);
+ } else {
+ if (dir)
+ *procroot_ret = svc_procroot;
+ *stats_ret = svc_stats;
+ }
+}
+
+static int
+ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ int total = 0;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ total += svcpt->scp_hist_nrqbds;
+
+ seq_printf(m, "%d\n", total);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len);
+
+static int
+ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ int total = 0;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ total += svc->srv_hist_nrqbds_cpt_max;
+
+ seq_printf(m, "%d\n", total);
+ return 0;
+}
+
+static ssize_t
+ptlrpc_lprocfs_req_history_max_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+ int bufpages;
+ int val;
+ int rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc < 0)
+ return rc;
+
+ if (val < 0)
+ return -ERANGE;
+
+ /* This sanity check is more of an insanity check; we can still
+ * hose a kernel by allowing the request history to grow too
+ * far. */
+ bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (val > totalram_pages / (2 * bufpages))
+ return -ERANGE;
+
+ spin_lock(&svc->srv_lock);
+
+ if (val == 0)
+ svc->srv_hist_nrqbds_cpt_max = 0;
+ else
+ svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts));
+
+ spin_unlock(&svc->srv_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max);
+
+static int
+ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+
+ seq_printf(m, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts);
+ return 0;
+}
+
+static ssize_t
+ptlrpc_lprocfs_threads_min_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+ int val;
+ int rc = lprocfs_write_helper(buffer, count, &val);
+
+ if (rc < 0)
+ return rc;
+
+ if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+ return -ERANGE;
+
+ spin_lock(&svc->srv_lock);
+ if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) {
+ spin_unlock(&svc->srv_lock);
+ return -ERANGE;
+ }
+
+ svc->srv_nthrs_cpt_init = val / svc->srv_ncpts;
+
+ spin_unlock(&svc->srv_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min);
+
+static int
+ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ int total = 0;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ total += svcpt->scp_nthrs_running;
+
+ seq_printf(m, "%d\n", total);
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started);
+
+static int
+ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+
+ seq_printf(m, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts);
+ return 0;
+}
+
+static ssize_t
+ptlrpc_lprocfs_threads_max_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+ int val;
+ int rc = lprocfs_write_helper(buffer, count, &val);
+
+ if (rc < 0)
+ return rc;
+
+ if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+ return -ERANGE;
+
+ spin_lock(&svc->srv_lock);
+ if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) {
+ spin_unlock(&svc->srv_lock);
+ return -ERANGE;
+ }
+
+ svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts;
+
+ spin_unlock(&svc->srv_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max);
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+extern struct nrs_core nrs_core;
+
+/**
+ * Translates \e ptlrpc_nrs_pol_state values to human-readable strings.
+ *
+ * \param[in] state The policy state
+ */
+static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state)
+{
+ switch (state) {
+ default:
+ LBUG();
+ case NRS_POL_STATE_INVALID:
+ return "invalid";
+ case NRS_POL_STATE_STOPPED:
+ return "stopped";
+ case NRS_POL_STATE_STOPPING:
+ return "stopping";
+ case NRS_POL_STATE_STARTING:
+ return "starting";
+ case NRS_POL_STATE_STARTED:
+ return "started";
+ }
+}
+
+/**
+ * Obtains status information for \a policy.
+ *
+ * Information is copied in \a info.
+ *
+ * \param[in] policy The policy
+ * \param[out] info Holds returned status information
+ */
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_pol_info *info)
+{
+ LASSERT(policy != NULL);
+ LASSERT(info != NULL);
+ assert_spin_locked(&policy->pol_nrs->nrs_lock);
+
+ memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX);
+
+ info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK);
+ info->pi_state = policy->pol_state;
+ /**
+ * XXX: These are accessed without holding
+ * ptlrpc_service_part::scp_req_lock.
+ */
+ info->pi_req_queued = policy->pol_req_queued;
+ info->pi_req_started = policy->pol_req_started;
+}
+
+/**
+ * Reads and prints policy status information for all policies of a PTLRPC
+ * service.
+ */
+static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ struct ptlrpc_nrs *nrs;
+ struct ptlrpc_nrs_policy *policy;
+ struct ptlrpc_nrs_pol_info *infos;
+ struct ptlrpc_nrs_pol_info tmp;
+ unsigned num_pols;
+ unsigned pol_idx = 0;
+ bool hp = false;
+ int i;
+ int rc = 0;
+
+ /**
+ * Serialize NRS core lprocfs operations with policy registration/
+ * unregistration.
+ */
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ /**
+ * Use the first service partition's regular NRS head in order to obtain
+ * the number of policies registered with NRS heads of this service. All
+ * service partitions will have the same number of policies.
+ */
+ nrs = nrs_svcpt2nrs(svc->srv_parts[0], false);
+
+ spin_lock(&nrs->nrs_lock);
+ num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols;
+ spin_unlock(&nrs->nrs_lock);
+
+ OBD_ALLOC(infos, num_pols * sizeof(*infos));
+ if (infos == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+again:
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ nrs = nrs_svcpt2nrs(svcpt, hp);
+ spin_lock(&nrs->nrs_lock);
+
+ pol_idx = 0;
+
+ list_for_each_entry(policy, &nrs->nrs_policy_list,
+ pol_list) {
+ LASSERT(pol_idx < num_pols);
+
+ nrs_policy_get_info_locked(policy, &tmp);
+ /**
+ * Copy values when handling the first service
+ * partition.
+ */
+ if (i == 0) {
+ memcpy(infos[pol_idx].pi_name, tmp.pi_name,
+ NRS_POL_NAME_MAX);
+ memcpy(&infos[pol_idx].pi_state, &tmp.pi_state,
+ sizeof(tmp.pi_state));
+ infos[pol_idx].pi_fallback = tmp.pi_fallback;
+ /**
+ * For the rest of the service partitions
+ * sanity-check the values we get.
+ */
+ } else {
+ LASSERT(strncmp(infos[pol_idx].pi_name,
+ tmp.pi_name,
+ NRS_POL_NAME_MAX) == 0);
+ /**
+ * Not asserting ptlrpc_nrs_pol_info::pi_state,
+ * because it may be different between
+ * instances of the same policy in different
+ * service partitions.
+ */
+ LASSERT(infos[pol_idx].pi_fallback ==
+ tmp.pi_fallback);
+ }
+
+ infos[pol_idx].pi_req_queued += tmp.pi_req_queued;
+ infos[pol_idx].pi_req_started += tmp.pi_req_started;
+
+ pol_idx++;
+ }
+ spin_unlock(&nrs->nrs_lock);
+ }
+
+ /**
+ * Policy status information output is in YAML format.
+ * For example:
+ *
+ * regular_requests:
+ * - name: fifo
+ * state: started
+ * fallback: yes
+ * queued: 0
+ * active: 0
+ *
+ * - name: crrn
+ * state: started
+ * fallback: no
+ * queued: 2015
+ * active: 384
+ *
+ * high_priority_requests:
+ * - name: fifo
+ * state: started
+ * fallback: yes
+ * queued: 0
+ * active: 2
+ *
+ * - name: crrn
+ * state: stopped
+ * fallback: no
+ * queued: 0
+ * active: 0
+ */
+ seq_printf(m, "%s\n",
+ !hp ? "\nregular_requests:" : "high_priority_requests:");
+
+ for (pol_idx = 0; pol_idx < num_pols; pol_idx++) {
+ seq_printf(m, " - name: %s\n"
+ " state: %s\n"
+ " fallback: %s\n"
+ " queued: %-20d\n"
+ " active: %-20d\n\n",
+ infos[pol_idx].pi_name,
+ nrs_state2str(infos[pol_idx].pi_state),
+ infos[pol_idx].pi_fallback ? "yes" : "no",
+ (int)infos[pol_idx].pi_req_queued,
+ (int)infos[pol_idx].pi_req_started);
+ }
+
+ if (!hp && nrs_svc_has_hp(svc)) {
+ memset(infos, 0, num_pols * sizeof(*infos));
+
+ /**
+ * Redo the processing for the service's HP NRS heads' policies.
+ */
+ hp = true;
+ goto again;
+ }
+
+out:
+ if (infos)
+ OBD_FREE(infos, num_pols * sizeof(*infos));
+
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ return rc;
+}
+
+/**
+ * The longest valid command string is the maximum policy name size, plus the
+ * length of the " reg" substring
+ */
+#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1)
+
+/**
+ * Starts and stops a given policy on a PTLRPC service.
+ *
+ * Commands consist of the policy name, followed by an optional [reg|hp] token;
+ * if the optional token is omitted, the operation is performed on both the
+ * regular and high-priority (if the service has one) NRS head.
+ */
+static ssize_t ptlrpc_lprocfs_nrs_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+ enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH;
+ char *cmd;
+ char *cmd_copy = NULL;
+ char *token;
+ int rc = 0;
+
+ if (count >= LPROCFS_NRS_WR_MAX_CMD) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD);
+ if (cmd == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ /**
+ * strsep() modifies its argument, so keep a copy
+ */
+ cmd_copy = cmd;
+
+ if (copy_from_user(cmd, buffer, count)) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ cmd[count] = '\0';
+
+ token = strsep(&cmd, " ");
+
+ if (strlen(token) > NRS_POL_NAME_MAX - 1) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /**
+ * No [reg|hp] token has been specified
+ */
+ if (cmd == NULL)
+ goto default_queue;
+
+ /**
+ * The second token is either NULL, or an optional [reg|hp] string
+ */
+ if (strcmp(cmd, "reg") == 0)
+ queue = PTLRPC_NRS_QUEUE_REG;
+ else if (strcmp(cmd, "hp") == 0)
+ queue = PTLRPC_NRS_QUEUE_HP;
+ else {
+ rc = -EINVAL;
+ goto out;
+ }
+
+default_queue:
+
+ if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) {
+ rc = -ENODEV;
+ goto out;
+ } else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc))
+ queue = PTLRPC_NRS_QUEUE_REG;
+
+ /**
+ * Serialize NRS core lprocfs operations with policy registration/
+ * unregistration.
+ */
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START,
+ false, NULL);
+
+ mutex_unlock(&nrs_core.nrs_mutex);
+out:
+ if (cmd_copy)
+ OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD);
+
+ return rc < 0 ? rc : count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs);
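+
+/*
+ * Example (illustrative; the exact procfs path depends on the service):
+ *
+ *   # start the "crrn" policy on the regular NRS head only
+ *   echo "crrn reg" > /proc/fs/lustre/.../<service>/nrs_policies
+ *
+ *   # with no [reg|hp] token the command applies to both heads
+ *   echo "fifo" > /proc/fs/lustre/.../<service>/nrs_policies
+ */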
+
+/** @} nrs */
+
+struct ptlrpc_srh_iterator {
+ int srhi_idx;
+ __u64 srhi_seq;
+ struct ptlrpc_request *srhi_req;
+};
+
+static int
+ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_srh_iterator *srhi,
+ __u64 seq)
+{
+ struct list_head *e;
+ struct ptlrpc_request *req;
+
+ if (srhi->srhi_req != NULL &&
+ srhi->srhi_seq > svcpt->scp_hist_seq_culled &&
+ srhi->srhi_seq <= seq) {
+ /* If srhi_req was set previously, hasn't been culled and
+ * we're searching for a seq on or after it (i.e. more
+ * recent), search from it onwards.
+ * Since the service history is LRU (i.e. culled reqs will
+ * be near the head), we shouldn't have to do long
+ * re-scans */
+ LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq,
+ "%s:%d: seek seq %llu, request seq %llu\n",
+ svcpt->scp_service->srv_name, svcpt->scp_cpt,
+ srhi->srhi_seq, srhi->srhi_req->rq_history_seq);
+ LASSERTF(!list_empty(&svcpt->scp_hist_reqs),
+ "%s:%d: seek offset %llu, request seq %llu, last culled %llu\n",
+ svcpt->scp_service->srv_name, svcpt->scp_cpt,
+ seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled);
+ e = &srhi->srhi_req->rq_history_list;
+ } else {
+ /* search from start */
+ e = svcpt->scp_hist_reqs.next;
+ }
+
+ while (e != &svcpt->scp_hist_reqs) {
+ req = list_entry(e, struct ptlrpc_request, rq_history_list);
+
+ if (req->rq_history_seq >= seq) {
+ srhi->srhi_seq = req->rq_history_seq;
+ srhi->srhi_req = req;
+ return 0;
+ }
+ e = e->next;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * The ptlrpc history sequence is used as the "position" of the seq_file.
+ * In some cases seq_read() increases the "position" to indicate reading the
+ * next element; however, the low bits of the history sequence are reserved
+ * for the CPT id (see the comments before ptlrpc_req_add_history()), so
+ * seq_read() might change the CPT id of the history sequence and never
+ * finish reading the requests on a CPT. To make this work, we shift the
+ * CPT id to the high bits and the timestamp to the low bits, so seq_read()
+ * only increases the timestamp, which correctly indicates the next position.
+ */
+
+/* convert seq_file pos to cpt */
+#define PTLRPC_REQ_POS2CPT(svc, pos) \
+ ((svc)->srv_cpt_bits == 0 ? 0 : \
+ (__u64)(pos) >> (64 - (svc)->srv_cpt_bits))
+
+/* make up seq_file pos from cpt */
+#define PTLRPC_REQ_CPT2POS(svc, cpt) \
+ ((svc)->srv_cpt_bits == 0 ? 0 : \
+ (cpt) << (64 - (svc)->srv_cpt_bits))
+
+/* convert sequence to position */
+#define PTLRPC_REQ_SEQ2POS(svc, seq) \
+ ((svc)->srv_cpt_bits == 0 ? (seq) : \
+ ((seq) >> (svc)->srv_cpt_bits) | \
+ ((seq) << (64 - (svc)->srv_cpt_bits)))
+
+/* convert position to sequence */
+#define PTLRPC_REQ_POS2SEQ(svc, pos) \
+ ((svc)->srv_cpt_bits == 0 ? (pos) : \
+ ((__u64)(pos) << (svc)->srv_cpt_bits) | \
+ ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits)))
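+
+/*
+ * Worked example (assuming srv_cpt_bits == 2, i.e. four CPTs): a history
+ * sequence keeps the CPT id in its low two bits, seq = (ts << 2) | cpt.
+ * PTLRPC_REQ_SEQ2POS() rotates this into pos = (cpt << 62) | ts, so an
+ * increment of the seq_file position only advances ts and never wanders
+ * into another CPT; PTLRPC_REQ_POS2SEQ() is the inverse rotation and
+ * PTLRPC_REQ_POS2CPT() simply reads the top two bits back.
+ */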
+
+static void *
+ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos)
+{
+ struct ptlrpc_service *svc = s->private;
+ struct ptlrpc_service_part *svcpt;
+ struct ptlrpc_srh_iterator *srhi;
+ unsigned int cpt;
+ int rc;
+ int i;
+
+ if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */
+ CWARN("Failed to read request history because size of loff_t %d can't match size of u64\n",
+ (int)sizeof(loff_t));
+ return NULL;
+ }
+
+ OBD_ALLOC(srhi, sizeof(*srhi));
+ if (srhi == NULL)
+ return NULL;
+
+ srhi->srhi_seq = 0;
+ srhi->srhi_req = NULL;
+
+ cpt = PTLRPC_REQ_POS2CPT(svc, *pos);
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (i < cpt) /* skip */
+ continue;
+ if (i > cpt) /* make up the lowest position for this CPT */
+ *pos = PTLRPC_REQ_CPT2POS(svc, i);
+
+ spin_lock(&svcpt->scp_lock);
+ rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi,
+ PTLRPC_REQ_POS2SEQ(svc, *pos));
+ spin_unlock(&svcpt->scp_lock);
+ if (rc == 0) {
+ *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+ srhi->srhi_idx = i;
+ return srhi;
+ }
+ }
+
+ OBD_FREE(srhi, sizeof(*srhi));
+ return NULL;
+}
+
+static void
+ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter)
+{
+ struct ptlrpc_srh_iterator *srhi = iter;
+
+ if (srhi != NULL)
+ OBD_FREE(srhi, sizeof(*srhi));
+}
+
+static void *
+ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s,
+ void *iter, loff_t *pos)
+{
+ struct ptlrpc_service *svc = s->private;
+ struct ptlrpc_srh_iterator *srhi = iter;
+ struct ptlrpc_service_part *svcpt;
+ __u64 seq;
+ int rc;
+ int i;
+
+ for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) {
+ svcpt = svc->srv_parts[i];
+
+ if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */
+ srhi->srhi_req = NULL;
+ seq = srhi->srhi_seq = 0;
+ } else { /* the next sequence */
+ seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits);
+ }
+
+ spin_lock(&svcpt->scp_lock);
+ rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq);
+ spin_unlock(&svcpt->scp_lock);
+ if (rc == 0) {
+ *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+ srhi->srhi_idx = i;
+ return srhi;
+ }
+ }
+
+ OBD_FREE(srhi, sizeof(*srhi));
+ return NULL;
+}
+
+/* common ost/mdt so_req_printer */
+void target_print_req(void *seq_file, struct ptlrpc_request *req)
+{
+ /* Called holding srv_lock with irqs disabled.
+ * Print specific req contents and a newline.
+ * CAVEAT EMPTOR: check request message length before printing!!!
+ * You might have received any old crap so you must be just as
+ * careful here as the service's request parser!!! */
+ struct seq_file *sf = seq_file;
+
+ switch (req->rq_phase) {
+ case RQ_PHASE_NEW:
+ /* still awaiting a service thread's attention, or rejected
+ * because the generic request message didn't unpack */
+ seq_printf(sf, "<not swabbed>\n");
+ break;
+ case RQ_PHASE_INTERPRET:
+ /* being handled, so basic msg swabbed, and opc is valid
+ * but racing with mds_handle() */
+ case RQ_PHASE_COMPLETE:
+ /* been handled by mds_handle() reply state possibly still
+ * volatile */
+ seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
+ break;
+ default:
+ DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase);
+ }
+}
+EXPORT_SYMBOL(target_print_req);
+
+static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
+{
+ struct ptlrpc_service *svc = s->private;
+ struct ptlrpc_srh_iterator *srhi = iter;
+ struct ptlrpc_service_part *svcpt;
+ struct ptlrpc_request *req;
+ int rc;
+
+ LASSERT(srhi->srhi_idx < svc->srv_ncpts);
+
+ svcpt = svc->srv_parts[srhi->srhi_idx];
+
+ spin_lock(&svcpt->scp_lock);
+
+ rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq);
+
+ if (rc == 0) {
+ req = srhi->srhi_req;
+
+ /* Print common req fields.
+ * CAVEAT EMPTOR: we're racing with the service handler
+ * here. The request could contain any old crap, so you
+ * must be just as careful as the service's request
+ * parser. Currently I only print stuff here I know is OK
+ * to look at coz it was set up in request_in_callback()!!! */
+ seq_printf(s, "%lld:%s:%s:x%llu:%d:%s:%ld:%lds(%+lds) ",
+ req->rq_history_seq, libcfs_nid2str(req->rq_self),
+ libcfs_id2str(req->rq_peer), req->rq_xid,
+ req->rq_reqlen, ptlrpc_rqphase2str(req),
+ req->rq_arrival_time.tv_sec,
+ req->rq_sent - req->rq_arrival_time.tv_sec,
+ req->rq_sent - req->rq_deadline);
+ if (svc->srv_ops.so_req_printer == NULL)
+ seq_printf(s, "\n");
+ else
+ svc->srv_ops.so_req_printer(s, srhi->srhi_req);
+ }
+
+ spin_unlock(&svcpt->scp_lock);
+ return rc;
+}
+
+static int
+ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file)
+{
+ static struct seq_operations sops = {
+ .start = ptlrpc_lprocfs_svc_req_history_start,
+ .stop = ptlrpc_lprocfs_svc_req_history_stop,
+ .next = ptlrpc_lprocfs_svc_req_history_next,
+ .show = ptlrpc_lprocfs_svc_req_history_show,
+ };
+ struct seq_file *seqf;
+ int rc;
+
+ rc = seq_open(file, &sops);
+ if (rc)
+ return rc;
+
+ seqf = file->private_data;
+ seqf->private = PDE_DATA(inode);
+ return 0;
+}
+
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n)
+{
+ struct ptlrpc_service *svc = m->private;
+ struct ptlrpc_service_part *svcpt;
+ struct dhms ts;
+ time_t worstt;
+ unsigned int cur;
+ unsigned int worst;
+ int i;
+
+ if (AT_OFF) {
+ seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n",
+ obd_timeout);
+ return 0;
+ }
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ cur = at_get(&svcpt->scp_at_estimate);
+ worst = svcpt->scp_at_estimate.at_worst_ever;
+ worstt = svcpt->scp_at_estimate.at_worst_time;
+ s2dhms(&ts, get_seconds() - worstt);
+
+ seq_printf(m, "%10s : cur %3u worst %3u (at %ld, "
+ DHMS_FMT" ago) ", "service",
+ cur, worst, worstt, DHMS_VARS(&ts));
+
+ lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate);
+ }
+
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts);
+
+static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v)
+{
+ struct ptlrpc_service *svc = m->private;
+
+ seq_printf(m, "%d\n", svc->srv_hpreq_ratio);
+ return 0;
+}
+
+static ssize_t ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count,
+ loff_t *off)
+{
+ struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+ int rc;
+ int val;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc < 0)
+ return rc;
+
+ if (val < 0)
+ return -ERANGE;
+
+ spin_lock(&svc->srv_lock);
+ svc->srv_hpreq_ratio = val;
+ spin_unlock(&svc->srv_lock);
+
+ return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio);
+
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
+ struct ptlrpc_service *svc)
+{
+ struct lprocfs_vars lproc_vars[] = {
+ {.name = "high_priority_ratio",
+ .fops = &ptlrpc_lprocfs_hp_ratio_fops,
+ .data = svc},
+ {.name = "req_buffer_history_len",
+ .fops = &ptlrpc_lprocfs_req_history_len_fops,
+ .data = svc},
+ {.name = "req_buffer_history_max",
+ .fops = &ptlrpc_lprocfs_req_history_max_fops,
+ .data = svc},
+ {.name = "threads_min",
+ .fops = &ptlrpc_lprocfs_threads_min_fops,
+ .data = svc},
+ {.name = "threads_max",
+ .fops = &ptlrpc_lprocfs_threads_max_fops,
+ .data = svc},
+ {.name = "threads_started",
+ .fops = &ptlrpc_lprocfs_threads_started_fops,
+ .data = svc},
+ {.name = "timeouts",
+ .fops = &ptlrpc_lprocfs_timeouts_fops,
+ .data = svc},
+ {.name = "nrs_policies",
+ .fops = &ptlrpc_lprocfs_nrs_fops,
+ .data = svc},
+ {NULL}
+ };
+ static const struct file_operations req_history_fops = {
+ .owner = THIS_MODULE,
+ .open = ptlrpc_lprocfs_svc_req_history_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = lprocfs_seq_release,
+ };
+
+ int rc;
+
+ ptlrpc_lprocfs_register(entry, svc->srv_name,
+ "stats", &svc->srv_procroot,
+ &svc->srv_stats);
+
+ if (svc->srv_procroot == NULL)
+ return;
+
+ lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL);
+
+ rc = lprocfs_seq_create(svc->srv_procroot, "req_history",
+ 0400, &req_history_fops, svc);
+ if (rc)
+ CWARN("Error adding the req_history file\n");
+}
+
+void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
+{
+ ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+ &obddev->obd_svc_procroot,
+ &obddev->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd);
+
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount)
+{
+ struct lprocfs_stats *svc_stats;
+ __u32 op = lustre_msg_get_opc(req->rq_reqmsg);
+ int opc = opcode_offset(op);
+
+ svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+ if (svc_stats == NULL || opc <= 0)
+ return;
+ LASSERT(opc < LUSTRE_MAX_OPCODES);
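+ /* LDLM_ENQUEUE and MDS_REINT are skipped here; those opcodes are
+ * presumably accounted via their more detailed per-intent/per-reint
+ * EXTRA_* counters rather than this generic per-opcode slot. */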
+ if (!(op == LDLM_ENQUEUE || op == MDS_REINT))
+ lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount);
+}
+
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
+{
+ struct lprocfs_stats *svc_stats;
+ int idx;
+
+ if (!req->rq_import)
+ return;
+ svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+ if (!svc_stats)
+ return;
+ idx = lustre_msg_get_opc(req->rq_reqmsg);
+ switch (idx) {
+ case OST_READ:
+ idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+ break;
+ case OST_WRITE:
+ idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+ break;
+ default:
+ LASSERTF(0, "unsupported opcode %u\n", idx);
+ break;
+ }
+
+ lprocfs_counter_add(svc_stats, idx, bytes);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
+
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+ if (svc->srv_procroot != NULL)
+ lprocfs_remove(&svc->srv_procroot);
+
+ if (svc->srv_stats)
+ lprocfs_free_stats(&svc->srv_stats);
+}
+
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd)
+{
+ if (obd->obd_svc_procroot)
+ lprocfs_remove(&obd->obd_svc_procroot);
+
+ if (obd->obd_svc_stats)
+ lprocfs_free_stats(&obd->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd);
+
+
+#define BUFLEN (UUID_MAX + 5)
+
+int lprocfs_wr_evict_client(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ char *kbuf;
+ char *tmpbuf;
+
+ OBD_ALLOC(kbuf, BUFLEN);
+ if (kbuf == NULL)
+ return -ENOMEM;
+
+ /*
+ * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1
+ * bytes into kbuf, to ensure that the string is NUL-terminated.
+ * UUID_MAX should include a trailing NUL already.
+ */
+ if (copy_from_user(kbuf, buffer,
+ min_t(unsigned long, BUFLEN - 1, count))) {
+ count = -EFAULT;
+ goto out;
+ }
+ tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count));
+ /* Kludge to avoid a deadlock: the lprocfs lock is held while the
+ * client is evicted by writing the client's
+ * uuid/nid to the procfs "evict_client" entry. However,
+ * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy
+ * the proc entries under the export being destroyed, so the lock
+ * has to be dropped first.
+ * - jay, jxiong@clusterfs.com */
+ class_incref(obd, __func__, current);
+
+ if (strncmp(tmpbuf, "nid:", 4) == 0)
+ obd_export_evict_by_nid(obd, tmpbuf + 4);
+ else if (strncmp(tmpbuf, "uuid:", 5) == 0)
+ obd_export_evict_by_uuid(obd, tmpbuf + 5);
+ else
+ obd_export_evict_by_uuid(obd, tmpbuf);
+
+ class_decref(obd, __func__, current);
+
+out:
+ OBD_FREE(kbuf, BUFLEN);
+ return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_evict_client);
+
+#undef BUFLEN
+
+int lprocfs_wr_ping(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct ptlrpc_request *req;
+ int rc;
+
+ LPROCFS_CLIMP_CHECK(obd);
+ req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+ LPROCFS_CLIMP_EXIT(obd);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req->rq_send_state = LUSTRE_IMP_FULL;
+
+ rc = ptlrpc_queue_wait(req);
+
+ ptlrpc_req_finished(req);
+ if (rc >= 0)
+ return count;
+ return rc;
+}
+EXPORT_SYMBOL(lprocfs_wr_ping);
+
+/* Write the connection UUID to this file to attempt to connect to that node.
+ * The connection UUID is a node's primary NID. For example,
+ * "echo connection=192.168.0.1@tcp0::instance > .../import".
+ */
+int lprocfs_wr_import(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct obd_import *imp = obd->u.cli.cl_import;
+ char *kbuf = NULL;
+ char *uuid;
+ char *ptr;
+ int do_reconn = 1;
+ const char prefix[] = "connection=";
+ const int prefix_len = sizeof(prefix) - 1;
+
+ if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len)
+ return -EINVAL;
+
+ OBD_ALLOC(kbuf, count + 1);
+ if (kbuf == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(kbuf, buffer, count)) {
+ count = -EFAULT;
+ goto out;
+ }
+
+ kbuf[count] = 0;
+
+ /* only support connection=uuid::instance now */
+ if (strncmp(prefix, kbuf, prefix_len) != 0) {
+ count = -EINVAL;
+ goto out;
+ }
+
+ uuid = kbuf + prefix_len;
+ ptr = strstr(uuid, "::");
+ if (ptr) {
+ __u32 inst;
+ char *endptr;
+
+ *ptr = 0;
+ do_reconn = 0;
+ ptr += strlen("::");
+ inst = simple_strtol(ptr, &endptr, 10);
+ if (*endptr) {
+ CERROR("config: wrong instance # %s\n", ptr);
+ } else if (inst != imp->imp_connect_data.ocd_instance) {
+ CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted target(%u/%u), reconnecting...\n",
+ imp->imp_obd->obd_name,
+ imp->imp_connect_data.ocd_instance, inst);
+ do_reconn = 1;
+ } else {
+ CDEBUG(D_INFO, "IR: %s has already been connecting to new target(%u)\n",
+ imp->imp_obd->obd_name, inst);
+ }
+ }
+
+ if (do_reconn)
+ ptlrpc_recover_import(imp, uuid, 1);
+
+out:
+ OBD_FREE(kbuf, count + 1);
+ return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_import);
+
+int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{
+ struct obd_device *obd = m->private;
+ struct obd_import *imp = obd->u.cli.cl_import;
+
+ LPROCFS_CLIMP_CHECK(obd);
+ seq_printf(m, "%d\n", !imp->imp_no_pinger_recover);
+ LPROCFS_CLIMP_EXIT(obd);
+
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_pinger_recov);
+
+int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct client_obd *cli = &obd->u.cli;
+ struct obd_import *imp = cli->cl_import;
+ int rc, val;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc < 0)
+ return rc;
+
+ if (val != 0 && val != 1)
+ return -ERANGE;
+
+ LPROCFS_CLIMP_CHECK(obd);
+ spin_lock(&imp->imp_lock);
+ imp->imp_no_pinger_recover = !val;
+ spin_unlock(&imp->imp_lock);
+ LPROCFS_CLIMP_EXIT(obd);
+
+ return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_pinger_recov);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
new file mode 100644
index 000000000..2fa258558
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
@@ -0,0 +1,731 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include "../include/obd_support.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_lib.h"
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "ptlrpc_internal.h"
+
+/**
+ * Helper function. Sends \a len bytes from \a base at offset \a offset
+ * over \a conn connection to portal \a portal.
+ * Returns 0 on success or error code.
+ */
+static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len,
+ lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+ struct ptlrpc_connection *conn, int portal, __u64 xid,
+ unsigned int offset)
+{
+ int rc;
+ lnet_md_t md;
+
+ LASSERT(portal != 0);
+ LASSERT(conn != NULL);
+ CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
+ md.start = base;
+ md.length = len;
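+ /* When an ACK is requested the MD sees two events (SEND and ACK),
+ * hence a threshold of 2; otherwise only the SEND event is expected. */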
+ md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
+ md.options = PTLRPC_MD_OPTIONS;
+ md.user_ptr = cbid;
+ md.eq_handle = ptlrpc_eq_h;
+
+ if (unlikely(ack == LNET_ACK_REQ &&
+ OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK,
+ OBD_FAIL_ONCE))) {
+ /* don't ask for the ack to simulate failing client */
+ ack = LNET_NOACK_REQ;
+ }
+
+ rc = LNetMDBind(md, LNET_UNLINK, mdh);
+ if (unlikely(rc != 0)) {
+ CERROR("LNetMDBind failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+ return -ENOMEM;
+ }
+
+ CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n",
+ len, portal, xid, offset);
+
+ rc = LNetPut(conn->c_self, *mdh, ack,
+ conn->c_peer, portal, xid, offset, 0);
+ if (unlikely(rc != 0)) {
+ int rc2;
+ /* We're going to get an UNLINK event when we unlink below,
+ * which will complete just like any other failed send, so
+ * fall through and return success here. */
+ CERROR("LNetPut(%s, %d, %lld) failed: %d\n",
+ libcfs_id2str(conn->c_peer), portal, xid, rc);
+ rc2 = LNetMDUnlink(*mdh);
+ LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+ }
+
+ return 0;
+}
+
+static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++)
+ LNetMDUnlink(bd_mds[i]);
+}
+
+
+/**
+ * Register bulk at the sender for later transfer.
+ * Returns 0 on success or error code.
+ */
+int ptlrpc_register_bulk(struct ptlrpc_request *req)
+{
+ struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ lnet_process_id_t peer;
+ int rc = 0;
+ int rc2;
+ int posted_md;
+ int total_md;
+ __u64 xid;
+ lnet_handle_me_t me_h;
+ lnet_md_t md;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
+ return 0;
+
+ /* NB no locking required until desc is on the network */
+ LASSERT(desc->bd_nob > 0);
+ LASSERT(desc->bd_md_count == 0);
+ LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT);
+ LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+ LASSERT(desc->bd_req != NULL);
+ LASSERT(desc->bd_type == BULK_PUT_SINK ||
+ desc->bd_type == BULK_GET_SOURCE);
+
+ /* cleanup the state of the bulk for it will be reused */
+ if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY)
+ desc->bd_nob_transferred = 0;
+ else
+ LASSERT(desc->bd_nob_transferred == 0);
+
+ desc->bd_failure = 0;
+
+ peer = desc->bd_import->imp_connection->c_peer;
+
+ LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback);
+ LASSERT(desc->bd_cbid.cbid_arg == desc);
+
+ /* An XID is only used for a single request from the client.
+ * For retried bulk transfers, a new XID will be allocated in
+ * ptlrpc_check_set() if it needs to be resent, so it is not
+ * using the same RDMA match bits after an error.
+ *
+ * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The
+ * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */
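+ /* For example (hypothetical values): with bd_md_max_brw == 4 and
+ * rq_xid == 0x1007, the first bulk xid computed below is 0x1004, and
+ * up to four consecutive matchbits (0x1004..0x1007) may be posted. */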
+ xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1);
+ LASSERTF(!(desc->bd_registered &&
+ req->rq_send_state != LUSTRE_IMP_REPLAY) ||
+ xid != desc->bd_last_xid,
+ "registered: %d rq_xid: %llu bd_last_xid: %llu\n",
+ desc->bd_registered, xid, desc->bd_last_xid);
+
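+ /* Each LNet MD describes at most LNET_MAX_IOV pages, so a large bulk
+ * is split across several MDs, one matchbit each. */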
+ total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV;
+ desc->bd_registered = 1;
+ desc->bd_last_xid = xid;
+ desc->bd_md_count = total_md;
+ md.user_ptr = &desc->bd_cbid;
+ md.eq_handle = ptlrpc_eq_h;
+ md.threshold = 1; /* PUT or GET */
+
+ for (posted_md = 0; posted_md < total_md; posted_md++, xid++) {
+ md.options = PTLRPC_MD_OPTIONS |
+ ((desc->bd_type == BULK_GET_SOURCE) ?
+ LNET_MD_OP_GET : LNET_MD_OP_PUT);
+ ptlrpc_fill_bulk_md(&md, desc, posted_md);
+
+ rc = LNetMEAttach(desc->bd_portal, peer, xid, 0,
+ LNET_UNLINK, LNET_INS_AFTER, &me_h);
+ if (rc != 0) {
+ CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n",
+ desc->bd_import->imp_obd->obd_name, xid,
+ posted_md, rc);
+ break;
+ }
+
+ /* About to let the network at it... */
+ rc = LNetMDAttach(me_h, md, LNET_UNLINK,
+ &desc->bd_mds[posted_md]);
+ if (rc != 0) {
+ CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n",
+ desc->bd_import->imp_obd->obd_name, xid,
+ posted_md, rc);
+ rc2 = LNetMEUnlink(me_h);
+ LASSERT(rc2 == 0);
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ LASSERT(rc == -ENOMEM);
+ spin_lock(&desc->bd_lock);
+ desc->bd_md_count -= total_md - posted_md;
+ spin_unlock(&desc->bd_lock);
+ LASSERT(desc->bd_md_count >= 0);
+ mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+ req->rq_status = -ENOMEM;
+ return -ENOMEM;
+ }
+
+ /* Set rq_xid to the matchbits of the final bulk so that the server
+ * can infer the number of bulks that were prepared */
+ req->rq_xid = --xid;
+ LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK),
+ "bd_last_xid = x%llu, rq_xid = x%llu\n",
+ desc->bd_last_xid, req->rq_xid);
+
+ spin_lock(&desc->bd_lock);
+ /* Holler if the peer manages to touch buffers before it knows the xid */
+ if (desc->bd_md_count != total_md)
+ CWARN("%s: Peer %s touched %d buffers while I registered\n",
+ desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer),
+ total_md - desc->bd_md_count);
+ spin_unlock(&desc->bd_lock);
+
+ CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, xid x%#llx-%#llx, portal %u\n",
+ desc->bd_md_count,
+ desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
+ desc->bd_iov_count, desc->bd_nob,
+ desc->bd_last_xid, req->rq_xid, desc->bd_portal);
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_register_bulk);
+
+/**
+ * Disconnect a bulk desc from the network. Idempotent. Not
+ * thread-safe (i.e. only interlocks with completion callback).
+ * Returns 1 on success or 0 if network unregistration failed for whatever
+ * reason.
+ */
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
+{
+ struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ wait_queue_head_t *wq;
+ struct l_wait_info lwi;
+ int rc;
+
+ LASSERT(!in_interrupt()); /* might sleep */
+
+ /* Let's setup deadline for reply unlink. */
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+ async && req->rq_bulk_deadline == 0)
+ req->rq_bulk_deadline = get_seconds() + LONG_UNLINK;
+
+ if (ptlrpc_client_bulk_active(req) == 0) /* completed or */
+ return 1; /* never registered */
+
+ LASSERT(desc->bd_req == req); /* bd_req NULL until registered */
+
+ /* the unlink ensures the callback happens ASAP and is the last
+ * one. If it fails, it must be because completion just happened,
+ * but we must still l_wait_event() in this case to give liblustre
+ * a chance to run client_bulk_callback() */
+ mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+
+ if (ptlrpc_client_bulk_active(req) == 0) /* completed or */
+ return 1; /* never registered */
+
+ /* Move to "Unregistering" phase as bulk was not unlinked yet. */
+ ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+
+ /* Do not wait for unlink to finish. */
+ if (async)
+ return 0;
+
+ if (req->rq_set != NULL)
+ wq = &req->rq_set->set_waitq;
+ else
+ wq = &req->rq_reply_waitq;
+
+ for (;;) {
+ /* Network access will complete in finite time but the HUGE
+ * timeout lets us CWARN for visibility of sluggish NALs */
+ lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+ cfs_time_seconds(1), NULL, NULL);
+ rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
+ if (rc == 0) {
+ ptlrpc_rqphase_move(req, req->rq_next_phase);
+ return 1;
+ }
+
+ LASSERT(rc == -ETIMEDOUT);
+ DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
+ desc);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+
+static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
+{
+ struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ int service_time = max_t(int, get_seconds() -
+ req->rq_arrival_time.tv_sec, 1);
+
+ if (!(flags & PTLRPC_REPLY_EARLY) &&
+ (req->rq_type != PTL_RPC_MSG_ERR) &&
+ (req->rq_reqmsg != NULL) &&
+ !(lustre_msg_get_flags(req->rq_reqmsg) &
+ (MSG_RESENT | MSG_REPLAY |
+ MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
+ /* early replies, errors and recovery requests don't count
+ * toward our service time estimate */
+ int oldse = at_measured(&svcpt->scp_at_estimate, service_time);
+
+ if (oldse != 0) {
+ DEBUG_REQ(D_ADAPTTO, req,
+ "svc %s changed estimate from %d to %d",
+ svc->srv_name, oldse,
+ at_get(&svcpt->scp_at_estimate));
+ }
+ }
+ /* Report actual service time for client latency calc */
+ lustre_msg_set_service_time(req->rq_repmsg, service_time);
+ /* Report service time estimate for future client reqs, but report 0
+ * (to be ignored by the client) if it's an error reply during recovery.
+ * (bz15815) */
+ if (req->rq_type == PTL_RPC_MSG_ERR &&
+ (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering))
+ lustre_msg_set_timeout(req->rq_repmsg, 0);
+ else
+ lustre_msg_set_timeout(req->rq_repmsg,
+ at_get(&svcpt->scp_at_estimate));
+
+ if (req->rq_reqmsg &&
+ !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+ CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%d:%x/%x len=%d\n",
+ flags, lustre_msg_get_flags(req->rq_reqmsg),
+ lustre_msg_is_v1(req->rq_reqmsg),
+ lustre_msg_get_magic(req->rq_reqmsg),
+ lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+ }
+}
+
+/**
+ * Send request reply from request \a req reply buffer.
+ * \a flags defines reply types
+ * Returns 0 on success or error code
+ */
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct ptlrpc_connection *conn;
+ int rc;
+
+ /* We must already have a reply buffer (only ptlrpc_error() may be
+ * called without one). The reply generated by the sptlrpc layer (e.g.
+ * error notify, etc.) might have a NULL req->rq_reqmsg; otherwise we must
+ * have a request buffer, which is either the actual (swabbed) incoming
+ * request, or a saved copy if this is a req saved in
+ * target_queue_final_reply().
+ */
+ LASSERT(req->rq_no_reply == 0);
+ LASSERT(req->rq_reqbuf != NULL);
+ LASSERT(rs != NULL);
+ LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
+ LASSERT(req->rq_repmsg != NULL);
+ LASSERT(req->rq_repmsg == rs->rs_msg);
+ LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback);
+ LASSERT(rs->rs_cb_id.cbid_arg == rs);
+
+ /* There may be no rq_export during failover */
+
+ if (unlikely(req->rq_export && req->rq_export->exp_obd &&
+ req->rq_export->exp_obd->obd_fail)) {
+ /* Failed obd's only send ENODEV */
+ req->rq_type = PTL_RPC_MSG_ERR;
+ req->rq_status = -ENODEV;
+ CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
+ req->rq_export->exp_obd->obd_minor);
+ }
+
+ /* In order to keep interoperability with clients (< 2.3) which
+ * don't have pb_jobid in ptlrpc_body, we have to shrink the
+ * ptlrpc_body in the reply buffer to ptlrpc_body_v2; otherwise the
+ * reply buffer on the client will overflow.
+ *
+ * XXX Remove this whenever we drop interoperability with
+ * such clients.
+ */
+ req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0,
+ sizeof(struct ptlrpc_body_v2), 1);
+
+ if (req->rq_type != PTL_RPC_MSG_ERR)
+ req->rq_type = PTL_RPC_MSG_REPLY;
+
+ lustre_msg_set_type(req->rq_repmsg, req->rq_type);
+ lustre_msg_set_status(req->rq_repmsg,
+ ptlrpc_status_hton(req->rq_status));
+ lustre_msg_set_opc(req->rq_repmsg,
+ req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
+
+ target_pack_pool_reply(req);
+
+ ptlrpc_at_set_reply(req, flags);
+
+ if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
+ conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
+ else
+ conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
+
+ if (unlikely(conn == NULL)) {
+ CERROR("not replying on NULL connection\n"); /* bug 9635 */
+ return -ENOTCONN;
+ }
+ ptlrpc_rs_addref(rs); /* +1 ref for the network */
+
+ rc = sptlrpc_svc_wrap_reply(req);
+ if (unlikely(rc))
+ goto out;
+
+ req->rq_sent = get_seconds();
+
+ rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+ (rs->rs_difficult && !rs->rs_no_ack) ?
+ LNET_ACK_REQ : LNET_NOACK_REQ,
+ &rs->rs_cb_id, conn,
+ ptlrpc_req2svc(req)->srv_rep_portal,
+ req->rq_xid, req->rq_reply_off);
+out:
+ if (unlikely(rc != 0))
+ ptlrpc_req_drop_rs(req);
+ ptlrpc_connection_put(conn);
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_send_reply);
+
+int ptlrpc_reply(struct ptlrpc_request *req)
+{
+ if (req->rq_no_reply)
+ return 0;
+ return ptlrpc_send_reply(req, 0);
+}
+EXPORT_SYMBOL(ptlrpc_reply);
+
+/**
+ * For request \a req send an error reply back. Create empty
+ * reply buffers if necessary.
+ */
+int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
+{
+ int rc;
+
+ if (req->rq_no_reply)
+ return 0;
+
+ if (!req->rq_repmsg) {
+ rc = lustre_pack_reply(req, 1, NULL, NULL);
+ if (rc)
+ return rc;
+ }
+
+ if (req->rq_status != -ENOSPC && req->rq_status != -EACCES &&
+ req->rq_status != -EPERM && req->rq_status != -ENOENT &&
+ req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT)
+ req->rq_type = PTL_RPC_MSG_ERR;
+
+ rc = ptlrpc_send_reply(req, may_be_difficult);
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_send_error);
+
+int ptlrpc_error(struct ptlrpc_request *req)
+{
+ return ptlrpc_send_error(req, 0);
+}
+EXPORT_SYMBOL(ptlrpc_error);
+
+/**
+ * Send request \a request.
+ * if \a noreply is set, don't expect any reply back and don't set up
+ * reply buffers.
+ * Returns 0 on success or error code.
+ */
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
+{
+ int rc;
+ int rc2;
+ int mpflag = 0;
+ struct ptlrpc_connection *connection;
+ lnet_handle_me_t reply_me_h;
+ lnet_md_t reply_md;
+ struct obd_device *obd = request->rq_import->imp_obd;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
+ return 0;
+
+ LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
+ LASSERT(request->rq_wait_ctx == 0);
+
+ /* If this is a re-transmit, we're required to have disengaged
+ * cleanly from the previous attempt */
+ LASSERT(!request->rq_receiving_reply);
+ LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) &&
+ (request->rq_import->imp_state == LUSTRE_IMP_FULL)));
+
+ if (unlikely(obd != NULL && obd->obd_fail)) {
+ CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
+ obd->obd_name);
+ /* this prevents us from waiting in ptlrpc_queue_wait */
+ spin_lock(&request->rq_lock);
+ request->rq_err = 1;
+ spin_unlock(&request->rq_lock);
+ request->rq_status = -ENODEV;
+ return -ENODEV;
+ }
+
+ connection = request->rq_import->imp_connection;
+
+ lustre_msg_set_handle(request->rq_reqmsg,
+ &request->rq_import->imp_remote_handle);
+ lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
+ lustre_msg_set_conn_cnt(request->rq_reqmsg,
+ request->rq_import->imp_conn_cnt);
+ lustre_msghdr_set_flags(request->rq_reqmsg,
+ request->rq_import->imp_msghdr_flags);
+
+ if (request->rq_resend)
+ lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
+ if (request->rq_memalloc)
+ mpflag = cfs_memory_pressure_get_and_set();
+
+ rc = sptlrpc_cli_wrap_request(request);
+ if (rc)
+ goto out;
+
+ /* bulk register should be done after wrap_request() */
+ if (request->rq_bulk != NULL) {
+ rc = ptlrpc_register_bulk(request);
+ if (rc != 0)
+ goto out;
+ }
+
+ if (!noreply) {
+ LASSERT(request->rq_replen != 0);
+ if (request->rq_repbuf == NULL) {
+ LASSERT(request->rq_repdata == NULL);
+ LASSERT(request->rq_repmsg == NULL);
+ rc = sptlrpc_cli_alloc_repbuf(request,
+ request->rq_replen);
+ if (rc) {
+ /* this prevents us from looping in
+ * ptlrpc_queue_wait */
+ spin_lock(&request->rq_lock);
+ request->rq_err = 1;
+ spin_unlock(&request->rq_lock);
+ request->rq_status = rc;
+ goto cleanup_bulk;
+ }
+ } else {
+ request->rq_repdata = NULL;
+ request->rq_repmsg = NULL;
+ }
+
+ rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
+ connection->c_peer, request->rq_xid, 0,
+ LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
+ if (rc != 0) {
+ CERROR("LNetMEAttach failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+ rc = -ENOMEM;
+ goto cleanup_bulk;
+ }
+ }
+
+ spin_lock(&request->rq_lock);
+ /* If the MD attach succeeds, there _will_ be a reply_in callback */
+ request->rq_receiving_reply = !noreply;
+ request->rq_req_unlink = 1;
+ /* We are responsible for unlinking the reply buffer */
+ request->rq_reply_unlink = !noreply;
+ /* Clear any flags that may be present from previous sends. */
+ request->rq_replied = 0;
+ request->rq_err = 0;
+ request->rq_timedout = 0;
+ request->rq_net_err = 0;
+ request->rq_resend = 0;
+ request->rq_restart = 0;
+ request->rq_reply_truncate = 0;
+ spin_unlock(&request->rq_lock);
+
+ if (!noreply) {
+ reply_md.start = request->rq_repbuf;
+ reply_md.length = request->rq_repbuf_len;
+ /* Allow multiple early replies */
+ reply_md.threshold = LNET_MD_THRESH_INF;
+ /* Manage remote for early replies */
+ reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+ LNET_MD_MANAGE_REMOTE |
+ LNET_MD_TRUNCATE; /* allow truncation, reported as -EOVERFLOW */
+ reply_md.user_ptr = &request->rq_reply_cbid;
+ reply_md.eq_handle = ptlrpc_eq_h;
+
+ /* We must see the unlink callback to unset rq_reply_unlink,
+ * so we can't auto-unlink */
+ rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
+ &request->rq_reply_md_h);
+ if (rc != 0) {
+ CERROR("LNetMDAttach failed: %d\n", rc);
+ LASSERT(rc == -ENOMEM);
+ spin_lock(&request->rq_lock);
+ /* ...but the MD attach didn't succeed... */
+ request->rq_receiving_reply = 0;
+ spin_unlock(&request->rq_lock);
+ rc = -ENOMEM;
+ goto cleanup_me;
+ }
+
+ CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n",
+ request->rq_repbuf_len, request->rq_xid,
+ request->rq_reply_portal);
+ }
+
+ /* add references on request for request_out_callback */
+ ptlrpc_request_addref(request);
+ if (obd != NULL && obd->obd_svc_stats != NULL)
+ lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
+ atomic_read(&request->rq_import->imp_inflight));
+
+ OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
+
+ do_gettimeofday(&request->rq_arrival_time);
+ request->rq_sent = get_seconds();
+ /* We give the server rq_timeout secs to process the req, and
+ add the network latency for our local timeout. */
+ request->rq_deadline = request->rq_sent + request->rq_timeout +
+ ptlrpc_at_get_net_latency(request);
+
+ ptlrpc_pinger_sending_on_import(request->rq_import);
+
+ DEBUG_REQ(D_INFO, request, "send flg=%x",
+ lustre_msg_get_flags(request->rq_reqmsg));
+ rc = ptl_send_buf(&request->rq_req_md_h,
+ request->rq_reqbuf, request->rq_reqdata_len,
+ LNET_NOACK_REQ, &request->rq_req_cbid,
+ connection,
+ request->rq_request_portal,
+ request->rq_xid, 0);
+ if (rc == 0)
+ goto out;
+
+ ptlrpc_req_finished(request);
+ if (noreply)
+ goto out;
+
+ cleanup_me:
+ /* MEUnlink is safe; the PUT didn't even get off the ground, and
+ * nobody apart from the PUT's target has the right nid+XID to
+ * access the reply buffer. */
+ rc2 = LNetMEUnlink(reply_me_h);
+ LASSERT(rc2 == 0);
+ /* UNLINKED callback called synchronously */
+ LASSERT(!request->rq_receiving_reply);
+
+ cleanup_bulk:
+ /* Do a synchronous unlink here: no real transfer has happened, so
+ * the chance of a long unlink due to a sluggish network is small. */
+ ptlrpc_unregister_bulk(request, 0);
+ out:
+ if (request->rq_memalloc)
+ cfs_memory_pressure_restore(mpflag);
+ return rc;
+}
+EXPORT_SYMBOL(ptl_send_rpc);
+
+/**
+ * Register request buffer descriptor for request receiving.
+ */
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+ struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service;
+ static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY};
+ int rc;
+ lnet_md_t md;
+ lnet_handle_me_t me_h;
+
+ CDEBUG(D_NET, "LNetMEAttach: portal %d\n",
+ service->srv_req_portal);
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
+ return -ENOMEM;
+
+ /* NB: a CPT-affinity service should use the new LNet flag LNET_INS_LOCAL,
+ * which means the buffer can only be attached on the local CPT, and LND
+ * threads can find it by grabbing a local lock */
+ rc = LNetMEAttach(service->srv_req_portal,
+ match_id, 0, ~0, LNET_UNLINK,
+ rqbd->rqbd_svcpt->scp_cpt >= 0 ?
+ LNET_INS_LOCAL : LNET_INS_AFTER, &me_h);
+ if (rc != 0) {
+ CERROR("LNetMEAttach failed: %d\n", rc);
+ return -ENOMEM;
+ }
+
+ LASSERT(rqbd->rqbd_refcount == 0);
+ rqbd->rqbd_refcount = 1;
+
+ md.start = rqbd->rqbd_buffer;
+ md.length = service->srv_buf_size;
+ md.max_size = service->srv_max_req_size;
+ md.threshold = LNET_MD_THRESH_INF;
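+ /* With LNET_MD_MAX_SIZE set, LNet should retire this buffer once less
+ * than srv_max_req_size bytes remain free, so an incoming request
+ * never lands in a buffer that cannot hold it. */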
+ md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
+ md.user_ptr = &rqbd->rqbd_cbid;
+ md.eq_handle = ptlrpc_eq_h;
+
+ rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h);
+ if (rc == 0)
+ return 0;
+
+ CERROR("LNetMDAttach failed: %d;\n", rc);
+ LASSERT(rc == -ENOMEM);
+ rc = LNetMEUnlink(me_h);
+ LASSERT(rc == 0);
+ rqbd->rqbd_refcount = 0;
+
+ return -ENOMEM;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
new file mode 100644
index 000000000..81ad74732
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
@@ -0,0 +1,1754 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs.c
+ *
+ * Network Request Scheduler (NRS)
+ *
+ * Allows the handling of RPCs at servers to be reordered.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lprocfs_status.h"
+#include "../../include/linux/libcfs/libcfs.h"
+#include "ptlrpc_internal.h"
+
+/* XXX: This is just for liblustre. Remove the #if defined directive when the
+ * "cfs_" prefix is dropped from cfs_list_head. */
+extern struct list_head ptlrpc_all_services;
+
+/**
+ * NRS core object.
+ */
+struct nrs_core nrs_core;
+
+static int nrs_policy_init(struct ptlrpc_nrs_policy *policy)
+{
+ return policy->pol_desc->pd_ops->op_policy_init != NULL ?
+ policy->pol_desc->pd_ops->op_policy_init(policy) : 0;
+}
+
+static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy)
+{
+ LASSERT(policy->pol_ref == 0);
+ LASSERT(policy->pol_req_queued == 0);
+
+ if (policy->pol_desc->pd_ops->op_policy_fini != NULL)
+ policy->pol_desc->pd_ops->op_policy_fini(policy);
+}
+
+static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy,
+ enum ptlrpc_nrs_ctl opc, void *arg)
+{
+ /**
+ * The policy may be stopped, but the lprocfs files and
+ * ptlrpc_nrs_policy instances remain present until unregistration time.
+ * Do not perform the ctl operation if the policy is stopped, as
+ * policy->pol_private will be NULL in such a case.
+ */
+ if (policy->pol_state == NRS_POL_STATE_STOPPED)
+ return -ENODEV;
+
+ return policy->pol_desc->pd_ops->op_policy_ctl != NULL ?
+ policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) :
+ -ENOSYS;
+}
+
+static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy)
+{
+ struct ptlrpc_nrs *nrs = policy->pol_nrs;
+
+ if (policy->pol_desc->pd_ops->op_policy_stop != NULL) {
+ spin_unlock(&nrs->nrs_lock);
+
+ policy->pol_desc->pd_ops->op_policy_stop(policy);
+
+ spin_lock(&nrs->nrs_lock);
+ }
+
+ LASSERT(list_empty(&policy->pol_list_queued));
+ LASSERT(policy->pol_req_queued == 0 &&
+ policy->pol_req_started == 0);
+
+ policy->pol_private = NULL;
+
+ policy->pol_state = NRS_POL_STATE_STOPPED;
+
+ if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+ module_put(policy->pol_desc->pd_owner);
+}
+
+static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy)
+{
+ struct ptlrpc_nrs *nrs = policy->pol_nrs;
+
+ if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping)
+ return -EPERM;
+
+ if (policy->pol_state == NRS_POL_STATE_STARTING)
+ return -EAGAIN;
+
+ /* In progress or already stopped */
+ if (policy->pol_state != NRS_POL_STATE_STARTED)
+ return 0;
+
+ policy->pol_state = NRS_POL_STATE_STOPPING;
+
+ /* Immediately make it invisible */
+ if (nrs->nrs_policy_primary == policy) {
+ nrs->nrs_policy_primary = NULL;
+
+ } else {
+ LASSERT(nrs->nrs_policy_fallback == policy);
+ nrs->nrs_policy_fallback = NULL;
+ }
+
+ /* I have the only refcount */
+ if (policy->pol_ref == 1)
+ nrs_policy_stop0(policy);
+
+ return 0;
+}
+
+/**
+ * Transitions the \a nrs NRS head's primary policy to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no
+ * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * \param[in] nrs the NRS head to carry out this operation on
+ */
+static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs)
+{
+ struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary;
+
+ if (tmp == NULL)
+ return;
+
+ nrs->nrs_policy_primary = NULL;
+
+ LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED);
+ tmp->pol_state = NRS_POL_STATE_STOPPING;
+
+ if (tmp->pol_ref == 0)
+ nrs_policy_stop0(tmp);
+}
+
+/**
+ * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in
+ * response to an lprocfs command to start a policy.
+ *
+ * If a primary policy different to the current one is specified, this function
+ * will transition the new policy to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition
+ * the old primary policy (if there is one) to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * If the fallback policy is specified, this is taken to indicate an instruction
+ * to stop the current primary policy, without substituting it with another
+ * primary policy, so the primary policy (if any) is transitioned to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In
+ * this case, the fallback policy is only left active in the NRS head.
+ */
+static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy)
+{
+ struct ptlrpc_nrs *nrs = policy->pol_nrs;
+ int rc = 0;
+
+ /**
+ * Don't allow multiple policies to be starting at the same time; that
+ * is too complex and has no real benefit.
+ */
+ if (nrs->nrs_policy_starting)
+ return -EAGAIN;
+
+ LASSERT(policy->pol_state != NRS_POL_STATE_STARTING);
+
+ if (policy->pol_state == NRS_POL_STATE_STOPPING)
+ return -EAGAIN;
+
+ if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+ /**
+ * This is for cases in which the user sets the policy to the
+ * fallback policy (currently fifo for all services); i.e. the
+ * user is resetting the policy to the default; so we stop the
+ * primary policy, if any.
+ */
+ if (policy == nrs->nrs_policy_fallback) {
+ nrs_policy_stop_primary(nrs);
+ return 0;
+ }
+
+ /**
+ * If we reach here, we must be setting up the fallback policy
+ * at service startup time, and only a single policy with the
+ * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can
+ * register with NRS core.
+ */
+ LASSERT(nrs->nrs_policy_fallback == NULL);
+ } else {
+ /**
+ * A primary policy must not be started if there is no fallback policy.
+ */
+ if (nrs->nrs_policy_fallback == NULL)
+ return -EPERM;
+
+ if (policy->pol_state == NRS_POL_STATE_STARTED)
+ return 0;
+ }
+
+ /**
+ * Increase the module usage count for policies registering from other
+ * modules.
+ */
+ if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 &&
+ !try_module_get(policy->pol_desc->pd_owner)) {
+ atomic_dec(&policy->pol_desc->pd_refs);
+ CERROR("NRS: cannot get module for policy %s; is it alive?\n",
+ policy->pol_desc->pd_name);
+ return -ENODEV;
+ }
+
+ /**
+ * Serialize policy starting across the NRS head
+ */
+ nrs->nrs_policy_starting = 1;
+
+ policy->pol_state = NRS_POL_STATE_STARTING;
+
+ if (policy->pol_desc->pd_ops->op_policy_start) {
+ spin_unlock(&nrs->nrs_lock);
+
+ rc = policy->pol_desc->pd_ops->op_policy_start(policy);
+
+ spin_lock(&nrs->nrs_lock);
+ if (rc != 0) {
+ if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+ module_put(policy->pol_desc->pd_owner);
+
+ policy->pol_state = NRS_POL_STATE_STOPPED;
+ goto out;
+ }
+ }
+
+ policy->pol_state = NRS_POL_STATE_STARTED;
+
+ if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+ /**
+ * This path is only used at PTLRPC service setup time.
+ */
+ nrs->nrs_policy_fallback = policy;
+ } else {
+ /*
+ * Try to stop the current primary policy if there is one.
+ */
+ nrs_policy_stop_primary(nrs);
+
+ /**
+ * And set the newly-started policy as the primary one.
+ */
+ nrs->nrs_policy_primary = policy;
+ }
+
+out:
+ nrs->nrs_policy_starting = 0;
+
+ return rc;
+}
+
+/**
+ * Increases the policy's usage reference count.
+ */
+static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy)
+{
+ policy->pol_ref++;
+}
+
+/**
+ * Decreases the policy's usage reference count, and stops the policy if it
+ * was already stopping and has no more outstanding usage references (which
+ * indicates it has no more queued or started requests, and can be safely
+ * stopped).
+ */
+static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy)
+{
+ LASSERT(policy->pol_ref > 0);
+
+ policy->pol_ref--;
+ if (unlikely(policy->pol_ref == 0 &&
+ policy->pol_state == NRS_POL_STATE_STOPPING))
+ nrs_policy_stop0(policy);
+}
+
+static void nrs_policy_put(struct ptlrpc_nrs_policy *policy)
+{
+ spin_lock(&policy->pol_nrs->nrs_lock);
+ nrs_policy_put_locked(policy);
+ spin_unlock(&policy->pol_nrs->nrs_lock);
+}
+
+/**
+ * Find and return a policy by name.
+ */
+static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs,
+ char *name)
+{
+ struct ptlrpc_nrs_policy *tmp;
+
+ list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) {
+ if (strncmp(tmp->pol_desc->pd_name, name,
+ NRS_POL_NAME_MAX) == 0) {
+ nrs_policy_get_locked(tmp);
+ return tmp;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * Release references for the resource hierarchy moving upwards towards the
+ * policy instance resource.
+ */
+static void nrs_resource_put(struct ptlrpc_nrs_resource *res)
+{
+ struct ptlrpc_nrs_policy *policy = res->res_policy;
+
+ if (policy->pol_desc->pd_ops->op_res_put != NULL) {
+ struct ptlrpc_nrs_resource *parent;
+
+ for (; res != NULL; res = parent) {
+ parent = res->res_parent;
+ policy->pol_desc->pd_ops->op_res_put(policy, res);
+ }
+ }
+}
+
+/**
+ * Obtains references for each resource in the resource hierarchy for request
+ * \a nrq if it is to be handled by \a policy.
+ *
+ * \param[in] policy the policy
+ * \param[in] nrq the request
+ * \param[in] moving_req denotes whether this is a call to the function by
+ * ldlm_lock_reorder_req(), in order to move \a nrq to
+ * the high-priority NRS head; we should not sleep when
+ * set.
+ *
+ * \retval NULL resource hierarchy references not obtained
+ * \retval valid-pointer the bottom level of the resource hierarchy
+ *
+ * \see ptlrpc_nrs_pol_ops::op_res_get()
+ */
+static
+struct ptlrpc_nrs_resource *nrs_resource_get(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq,
+ bool moving_req)
+{
+ /**
+ * Set to NULL to traverse the resource hierarchy from the top.
+ */
+ struct ptlrpc_nrs_resource *res = NULL;
+ struct ptlrpc_nrs_resource *tmp = NULL;
+ int rc;
+
+ while (1) {
+ rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res,
+ &tmp, moving_req);
+ if (rc < 0) {
+ if (res != NULL)
+ nrs_resource_put(res);
+ return NULL;
+ }
+
+ LASSERT(tmp != NULL);
+ tmp->res_parent = res;
+ tmp->res_policy = policy;
+ res = tmp;
+ tmp = NULL;
+ /**
+ * Return once we have obtained a reference to the bottom level
+ * of the resource hierarchy.
+ */
+ if (rc > 0)
+ return res;
+ }
+}
+
+/**
+ * Obtains resources for the resource hierarchies and policy references for
+ * the fallback and current primary policy (if any), that will later be used
+ * to handle request \a nrq.
+ *
+ * \param[in] nrs the NRS head instance that will be handling request \a nrq.
+ * \param[in] nrq the request that is being handled.
+ * \param[out] resp the array where references to the resource hierarchy are
+ * stored.
+ * \param[in] moving_req is set when obtaining resources while moving a
+ * request from a policy on the regular NRS head to a
+ * policy on the HP NRS head (via
+ * ldlm_lock_reorder_req()). It signifies that
+ * allocations to get resources should be atomic; for
+ * a full explanation, see comment in
+ * ptlrpc_nrs_pol_ops::op_res_get().
+ */
+static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs,
+ struct ptlrpc_nrs_request *nrq,
+ struct ptlrpc_nrs_resource **resp,
+ bool moving_req)
+{
+ struct ptlrpc_nrs_policy *primary = NULL;
+ struct ptlrpc_nrs_policy *fallback = NULL;
+
+ memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX);
+
+ /**
+ * Obtain policy references.
+ */
+ spin_lock(&nrs->nrs_lock);
+
+ fallback = nrs->nrs_policy_fallback;
+ nrs_policy_get_locked(fallback);
+
+ primary = nrs->nrs_policy_primary;
+ if (primary != NULL)
+ nrs_policy_get_locked(primary);
+
+ spin_unlock(&nrs->nrs_lock);
+
+ /**
+ * Obtain resource hierarchy references.
+ */
+ resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req);
+ LASSERT(resp[NRS_RES_FALLBACK] != NULL);
+
+ if (primary != NULL) {
+ resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq,
+ moving_req);
+ /**
+ * The primary policy may decline to serve a particular request
+ * for various reasons; in that case, release the reference on
+ * the policy, as it will not be used for this request.
+ */
+ if (resp[NRS_RES_PRIMARY] == NULL)
+ nrs_policy_put(primary);
+ }
+}
+
+/**
+ * Releases references to resource hierarchies and policies, because they are no
+ * longer required; used when request handling has been completed, or the
+ * request is moving to the high priority NRS head.
+ *
+ * \param resp the resource hierarchy that is being released
+ *
+ * \see ptlrpcnrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+ struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+ struct ptlrpc_nrs *nrs = NULL;
+ int i;
+
+ for (i = 0; i < NRS_RES_MAX; i++) {
+ if (resp[i] != NULL) {
+ pols[i] = resp[i]->res_policy;
+ nrs_resource_put(resp[i]);
+ resp[i] = NULL;
+ } else {
+ pols[i] = NULL;
+ }
+ }
+
+ for (i = 0; i < NRS_RES_MAX; i++) {
+ if (pols[i] == NULL)
+ continue;
+
+ if (nrs == NULL) {
+ nrs = pols[i]->pol_nrs;
+ spin_lock(&nrs->nrs_lock);
+ }
+ nrs_policy_put_locked(pols[i]);
+ }
+
+ if (nrs != NULL)
+ spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled.
+ *
+ * \param[in] policy the policy from which a request is to be obtained
+ * \param[in] peek when set, signifies that we just want to examine the
+ * request, and not handle it, so the request is not removed
+ * from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ * has one pending
+ *
+ * \retval the NRS request to be handled
+ */
+static inline
+struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy,
+ bool peek, bool force)
+{
+ struct ptlrpc_nrs_request *nrq;
+
+ LASSERT(policy->pol_req_queued > 0);
+
+ nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+
+ LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+ return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
+ *
+ * \param nrq the request being enqueued
+ *
+ * \see nrs_resource_get_safe()
+ */
+static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq)
+{
+ struct ptlrpc_nrs_policy *policy;
+ int rc;
+ int i;
+
+ /**
+ * Try in descending order, because the primary policy (if any) is
+ * the preferred choice.
+ */
+ for (i = NRS_RES_MAX - 1; i >= 0; i--) {
+ if (nrq->nr_res_ptrs[i] == NULL)
+ continue;
+
+ nrq->nr_res_idx = i;
+ policy = nrq->nr_res_ptrs[i]->res_policy;
+
+ rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq);
+ if (rc == 0) {
+ policy->pol_nrs->nrs_req_queued++;
+ policy->pol_req_queued++;
+ return;
+ }
+ }
+ /**
+ * Should never get here, as at least the primary policy's
+ * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always
+ * succeed.
+ */
+ LBUG();
+}
+
+/**
+ * Called when a request has been handled
+ *
+ * \param[in] nrq the request that has been handled; can be used for
+ * job/resource control.
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq)
+{
+ struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq);
+
+ if (policy->pol_desc->pd_ops->op_req_stop)
+ policy->pol_desc->pd_ops->op_req_stop(policy, nrq);
+
+ LASSERT(policy->pol_nrs->nrs_req_started > 0);
+ LASSERT(policy->pol_req_started > 0);
+
+ policy->pol_nrs->nrs_req_started--;
+ policy->pol_req_started--;
+}
+
+/**
+ * Handler for operations that can be carried out on policies.
+ *
+ * Handles opcodes that are common to all policy types within NRS core, and
+ * passes any unknown opcodes to the policy-specific control function.
+ *
+ * \param[in] nrs the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ * ptlrpc_nrs_pol_desc::pd_name.
+ * \param[in] opc the opcode of the operation being carried out.
+ * \param[in,out] arg can be used to pass information in and out between when
+ * carrying an operation; usually data that is private to
+ * the policy at some level, or generic policy status
+ * information.
+ *
+ * \retval -ve error condition
+ * \retval 0 operation was carried out successfully
+ */
+static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name,
+ enum ptlrpc_nrs_ctl opc, void *arg)
+{
+ struct ptlrpc_nrs_policy *policy;
+ int rc = 0;
+
+ spin_lock(&nrs->nrs_lock);
+
+ policy = nrs_policy_find_locked(nrs, name);
+ if (policy == NULL) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ switch (opc) {
+ /**
+ * Unknown opcode, pass it down to the policy-specific control
+ * function for handling.
+ */
+ default:
+ rc = nrs_policy_ctl_locked(policy, opc, arg);
+ break;
+
+ /**
+ * Start \e policy
+ */
+ case PTLRPC_NRS_CTL_START:
+ rc = nrs_policy_start_locked(policy);
+ break;
+ }
+out:
+ if (policy != NULL)
+ nrs_policy_put_locked(policy);
+
+ spin_unlock(&nrs->nrs_lock);
+
+ return rc;
+}
+
+/**
+ * Unregisters a policy by name.
+ *
+ * \param[in] nrs the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ * ptlrpc_nrs_pol_desc::pd_name
+ *
+ * \retval -ve error
+ * \retval 0 success
+ */
+static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name)
+{
+ struct ptlrpc_nrs_policy *policy = NULL;
+
+ spin_lock(&nrs->nrs_lock);
+
+ policy = nrs_policy_find_locked(nrs, name);
+ if (policy == NULL) {
+ spin_unlock(&nrs->nrs_lock);
+
+ CERROR("Can't find NRS policy %s\n", name);
+ return -ENOENT;
+ }
+
+ if (policy->pol_ref > 1) {
+ CERROR("Policy %s is busy with %d references\n", name,
+ (int)policy->pol_ref);
+ nrs_policy_put_locked(policy);
+
+ spin_unlock(&nrs->nrs_lock);
+ return -EBUSY;
+ }
+
+ LASSERT(policy->pol_req_queued == 0);
+ LASSERT(policy->pol_req_started == 0);
+
+ if (policy->pol_state != NRS_POL_STATE_STOPPED) {
+ nrs_policy_stop_locked(policy);
+ LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED);
+ }
+
+ list_del(&policy->pol_list);
+ nrs->nrs_num_pols--;
+
+ nrs_policy_put_locked(policy);
+
+ spin_unlock(&nrs->nrs_lock);
+
+ nrs_policy_fini(policy);
+
+ LASSERT(policy->pol_private == NULL);
+ OBD_FREE_PTR(policy);
+
+ return 0;
+}
+
+/**
+ * Registers a policy from the policy descriptor \a desc with NRS head \a nrs.
+ *
+ * \param[in] nrs the NRS head on which the policy will be registered.
+ * \param[in] desc the policy descriptor from which the information will be
+ * obtained to register the policy.
+ *
+ * \retval -ve error
+ * \retval 0 success
+ */
+static int nrs_policy_register(struct ptlrpc_nrs *nrs,
+ struct ptlrpc_nrs_pol_desc *desc)
+{
+ struct ptlrpc_nrs_policy *policy;
+ struct ptlrpc_nrs_policy *tmp;
+ struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt;
+ int rc;
+
+ LASSERT(svcpt != NULL);
+ LASSERT(desc->pd_ops != NULL);
+ LASSERT(desc->pd_ops->op_res_get != NULL);
+ LASSERT(desc->pd_ops->op_req_get != NULL);
+ LASSERT(desc->pd_ops->op_req_enqueue != NULL);
+ LASSERT(desc->pd_ops->op_req_dequeue != NULL);
+ LASSERT(desc->pd_compat != NULL);
+
+ OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable,
+ svcpt->scp_cpt, sizeof(*policy), GFP_NOFS);
+ if (policy == NULL)
+ return -ENOMEM;
+
+ policy->pol_nrs = nrs;
+ policy->pol_desc = desc;
+ policy->pol_state = NRS_POL_STATE_STOPPED;
+ policy->pol_flags = desc->pd_flags;
+
+ INIT_LIST_HEAD(&policy->pol_list);
+ INIT_LIST_HEAD(&policy->pol_list_queued);
+
+ rc = nrs_policy_init(policy);
+ if (rc != 0) {
+ OBD_FREE_PTR(policy);
+ return rc;
+ }
+
+ spin_lock(&nrs->nrs_lock);
+
+ tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name);
+ if (tmp != NULL) {
+ CERROR("NRS policy %s has been registered, can't register it for %s\n",
+ policy->pol_desc->pd_name,
+ svcpt->scp_service->srv_name);
+ nrs_policy_put_locked(tmp);
+
+ spin_unlock(&nrs->nrs_lock);
+ nrs_policy_fini(policy);
+ OBD_FREE_PTR(policy);
+
+ return -EEXIST;
+ }
+
+ list_add_tail(&policy->pol_list, &nrs->nrs_policy_list);
+ nrs->nrs_num_pols++;
+
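+ /* Policies flagged PTLRPC_NRS_FL_REG_START are started right away at
+ * registration time, rather than waiting for an explicit lprocfs
+ * "start" command. */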
+ if (policy->pol_flags & PTLRPC_NRS_FL_REG_START)
+ rc = nrs_policy_start_locked(policy);
+
+ spin_unlock(&nrs->nrs_lock);
+
+ if (rc != 0)
+ (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+
+ return rc;
+}
+
+/**
+ * Enqueue request \a req using one of the policies its resources are referring
+ * to.
+ *
+ * \param[in] req the request to enqueue.
+ */
+static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req)
+{
+ struct ptlrpc_nrs_policy *policy;
+
+ LASSERT(req->rq_nrq.nr_initialized);
+ LASSERT(!req->rq_nrq.nr_enqueued);
+
+ nrs_request_enqueue(&req->rq_nrq);
+ req->rq_nrq.nr_enqueued = 1;
+
+ policy = nrs_request_policy(&req->rq_nrq);
+ /**
+ * Add the policy to the NRS head's list of policies with enqueued
+ * requests, if it has not been added there.
+ */
+ if (unlikely(list_empty(&policy->pol_list_queued)))
+ list_add_tail(&policy->pol_list_queued,
+ &policy->pol_nrs->nrs_policy_queued);
+}
+
+/**
+ * Enqueue a request on the high priority NRS head.
+ *
+ * \param req the request to enqueue.
+ */
+static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req)
+{
+ int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+ spin_lock(&req->rq_lock);
+ req->rq_hp = 1;
+ ptlrpc_nrs_req_add_nolock(req);
+ if (opc != OBD_PING)
+ DEBUG_REQ(D_NET, req, "high priority req");
+ spin_unlock(&req->rq_lock);
+}
+
+/**
+ * Returns a boolean predicate indicating whether the policy described by
+ * \a desc is adequate for use with service \a svc.
+ *
+ * \param[in] svc the service
+ * \param[in] desc the policy descriptor
+ *
+ * \retval false the policy is not compatible with the service
+ * \retval true the policy is compatible with the service
+ */
+static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc,
+ const struct ptlrpc_nrs_pol_desc *desc)
+{
+ return desc->pd_compat(svc, desc);
+}
+
+/**
+ * Registers all compatible policies in nrs_core.nrs_policies, for NRS head
+ * \a nrs.
+ *
+ * \param[in] nrs the NRS head
+ *
+ * \retval -ve error
+ * \retval 0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ *
+ * \see ptlrpc_service_nrs_setup()
+ */
+static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs)
+{
+ struct ptlrpc_nrs_pol_desc *desc;
+ /* for convenience */
+ struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt;
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ int rc = -EINVAL;
+
+ LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+ list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+ if (nrs_policy_compatible(svc, desc)) {
+ rc = nrs_policy_register(nrs, desc);
+ if (rc != 0) {
+ CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n",
+ desc->pd_name, svcpt->scp_cpt,
+ svc->srv_name, rc);
+ /**
+ * Fail registration if any of the policies'
+ * registration fails.
+ */
+ break;
+ }
+ }
+ }
+
+ return rc;
+}
+
+/**
+ * Initializes NRS head \a nrs of service partition \a svcpt, and registers all
+ * compatible policies in NRS core, with the NRS head.
+ *
+ * \param[in] nrs the NRS head
+ * \param[in] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval 0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs,
+ struct ptlrpc_service_part *svcpt)
+{
+ enum ptlrpc_nrs_queue_type queue;
+
+ LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+ if (nrs == &svcpt->scp_nrs_reg)
+ queue = PTLRPC_NRS_QUEUE_REG;
+ else if (nrs == svcpt->scp_nrs_hp)
+ queue = PTLRPC_NRS_QUEUE_HP;
+ else
+ LBUG();
+
+ nrs->nrs_svcpt = svcpt;
+ nrs->nrs_queue_type = queue;
+ spin_lock_init(&nrs->nrs_lock);
+ INIT_LIST_HEAD(&nrs->nrs_policy_list);
+ INIT_LIST_HEAD(&nrs->nrs_policy_queued);
+
+ return nrs_register_policies_locked(nrs);
+}
+
+/**
+ * Allocates a regular and optionally a high-priority NRS head (if the service
+ * handles high-priority RPCs), and then registers all available compatible
+ * policies on those NRS heads.
+ *
+ * \param[in,out] svcpt the PTLRPC service partition to setup
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_nrs *nrs;
+ int rc;
+
+ LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+ /**
+ * Initialize the regular NRS head.
+ */
+ nrs = nrs_svcpt2nrs(svcpt, false);
+ rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+ if (rc < 0)
+ goto out;
+
+ /**
+ * Optionally allocate a high-priority NRS head.
+ */
+ if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL)
+ goto out;
+
+ OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp,
+ svcpt->scp_service->srv_cptable,
+ svcpt->scp_cpt);
+ if (svcpt->scp_nrs_hp == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ nrs = nrs_svcpt2nrs(svcpt, true);
+ rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+
+out:
+ return rc;
+}
+
+/**
+ * Unregisters all policies on all available NRS heads in a service partition;
+ * called at PTLRPC service unregistration time.
+ *
+ * \param[in] svcpt the PTLRPC service partition
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_nrs *nrs;
+ struct ptlrpc_nrs_policy *policy;
+ struct ptlrpc_nrs_policy *tmp;
+ int rc;
+ bool hp = false;
+
+ LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+again:
+ nrs = nrs_svcpt2nrs(svcpt, hp);
+ nrs->nrs_stopping = 1;
+
+ list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list,
+ pol_list) {
+ rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+ LASSERT(rc == 0);
+ }
+
+ /**
+ * If the service partition has an HP NRS head, clean that up as well.
+ */
+ if (!hp && nrs_svcpt_has_hp(svcpt)) {
+ hp = true;
+ goto again;
+ }
+
+ if (hp)
+ OBD_FREE_PTR(nrs);
+}
+
+/**
+ * Returns the descriptor for a policy as identified by \a name.
+ *
+ * \param[in] name the policy name
+ *
+ * \retval the policy descriptor if a policy with the given name is registered
+ * \retval NULL if no such policy has been registered with NRS core
+ */
+static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name)
+{
+ struct ptlrpc_nrs_pol_desc *tmp;
+
+ list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) {
+ if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0)
+ return tmp;
+ }
+ return NULL;
+}
+
+/**
+ * Removes the policy from all supported NRS heads of all partitions of all
+ * PTLRPC services.
+ *
+ * \param[in] desc the policy descriptor to unregister
+ *
+ * \retval -ve error
+ * \retval 0 successfully unregistered policy on all supported NRS heads
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ * \pre mutex_is_locked(&ptlrpc_all_services_mutex)
+ */
+static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc)
+{
+ struct ptlrpc_nrs *nrs;
+ struct ptlrpc_service *svc;
+ struct ptlrpc_service_part *svcpt;
+ int i;
+ int rc = 0;
+
+ LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+ LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex));
+
+ list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+
+ if (!nrs_policy_compatible(svc, desc) ||
+ unlikely(svc->srv_is_stopping))
+ continue;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ bool hp = false;
+
+again:
+ nrs = nrs_svcpt2nrs(svcpt, hp);
+ rc = nrs_policy_unregister(nrs, desc->pd_name);
+ /**
+ * Ignore -ENOENT as the policy may not have registered
+ * successfully on all service partitions.
+ */
+ if (rc == -ENOENT) {
+ rc = 0;
+ } else if (rc != 0) {
+ CERROR("Failed to unregister NRS policy %s for partition %d of service %s: %d\n",
+ desc->pd_name, svcpt->scp_cpt,
+ svcpt->scp_service->srv_name, rc);
+ return rc;
+ }
+
+ if (!hp && nrs_svc_has_hp(svc)) {
+ hp = true;
+ goto again;
+ }
+ }
+
+ if (desc->pd_ops->op_lprocfs_fini != NULL)
+ desc->pd_ops->op_lprocfs_fini(svc);
+ }
+
+ return rc;
+}
+
+/**
+ * Registers a new policy with NRS core.
+ *
+ * The function will only succeed if policy registration with all compatible
+ * service partitions (if any) is successful.
+ *
+ * N.B. This function should be called either at ptlrpc module initialization
+ * time when registering a policy that ships with NRS core, or in a
+ * module's init() function for policies registering from other modules.
+ *
+ * \param[in] conf configuration information for the new policy to register
+ *
+ * \retval -ve error
+ * \retval 0 success
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf)
+{
+ struct ptlrpc_service *svc;
+ struct ptlrpc_nrs_pol_desc *desc;
+ int rc = 0;
+
+ LASSERT(conf != NULL);
+ LASSERT(conf->nc_ops != NULL);
+ LASSERT(conf->nc_compat != NULL);
+ LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one,
+ conf->nc_compat_svc_name != NULL));
+ LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0,
+ conf->nc_owner != NULL));
+
+ conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+ /**
+ * External policies are not allowed to start immediately upon
+ * registration, as there is a relatively higher chance that their
+ * registration might fail. In such a case, some policy instances may
+ * already have requests queued when unregistration needs to happen as
+ * part of cleanup; since there is currently no way to drain requests
+ * from a policy unless the service is unregistering, we just disallow
+ * this.
+ */
+ if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) &&
+ (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK |
+ PTLRPC_NRS_FL_REG_START))) {
+ CERROR("NRS: failing to register policy %s. Please check policy flags; external policies cannot act as fallback policies, or be started immediately upon registration without interaction with lprocfs\n",
+ conf->nc_name);
+ return -EINVAL;
+ }
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) {
+ CERROR("NRS: failing to register policy %s which has already been registered with NRS core!\n",
+ conf->nc_name);
+ rc = -EEXIST;
+ goto fail;
+ }
+
+ OBD_ALLOC_PTR(desc);
+ if (desc == NULL) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX);
+ desc->pd_ops = conf->nc_ops;
+ desc->pd_compat = conf->nc_compat;
+ desc->pd_compat_svc_name = conf->nc_compat_svc_name;
+ if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0)
+ desc->pd_owner = conf->nc_owner;
+ desc->pd_flags = conf->nc_flags;
+ atomic_set(&desc->pd_refs, 0);
+
+ /**
+ * For policies that are held in the same module as NRS (currently
+ * ptlrpc), do not register the policy with all compatible services,
+ * as the services will not have started at this point, since we are
+ * calling from ptlrpc module initialization code. In such cases each
+ * service will register all compatible policies later, via
+ * ptlrpc_service_nrs_setup().
+ */
+ if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0)
+ goto internal;
+
+ /**
+ * Register the new policy on all compatible services
+ */
+ mutex_lock(&ptlrpc_all_services_mutex);
+
+ list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+ struct ptlrpc_service_part *svcpt;
+ int i;
+ int rc2;
+
+ if (!nrs_policy_compatible(svc, desc) ||
+ unlikely(svc->srv_is_stopping))
+ continue;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ struct ptlrpc_nrs *nrs;
+ bool hp = false;
+again:
+ nrs = nrs_svcpt2nrs(svcpt, hp);
+ rc = nrs_policy_register(nrs, desc);
+ if (rc != 0) {
+ CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n",
+ desc->pd_name, svcpt->scp_cpt,
+ svcpt->scp_service->srv_name, rc);
+
+ rc2 = nrs_policy_unregister_locked(desc);
+ /**
+ * Should not fail at this point
+ */
+ LASSERT(rc2 == 0);
+ mutex_unlock(&ptlrpc_all_services_mutex);
+ OBD_FREE_PTR(desc);
+ goto fail;
+ }
+
+ if (!hp && nrs_svc_has_hp(svc)) {
+ hp = true;
+ goto again;
+ }
+ }
+
+ /**
+ * No need to take a reference to other modules here, as we
+ * will be calling from the module's init() function.
+ */
+ if (desc->pd_ops->op_lprocfs_init != NULL) {
+ rc = desc->pd_ops->op_lprocfs_init(svc);
+ if (rc != 0) {
+ rc2 = nrs_policy_unregister_locked(desc);
+ /**
+ * Should not fail at this point
+ */
+ LASSERT(rc2 == 0);
+ mutex_unlock(&ptlrpc_all_services_mutex);
+ OBD_FREE_PTR(desc);
+ goto fail;
+ }
+ }
+ }
+
+ mutex_unlock(&ptlrpc_all_services_mutex);
+internal:
+ list_add_tail(&desc->pd_list, &nrs_core.nrs_policies);
+fail:
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_register);
+
+/**
+ * Unregisters a previously registered policy with NRS core. All instances of
+ * the policy on all NRS heads of all supported services are removed.
+ *
+ * N.B. This function should only be called from a module's exit() function.
+ * Although it can be used for policies that ship alongside NRS core, the
+ * function is primarily intended for policies that register externally,
+ * from other modules.
+ *
+ * \param[in] conf configuration information for the policy to unregister
+ *
+ * \retval -ve error
+ * \retval 0 success
+ */
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf)
+{
+ struct ptlrpc_nrs_pol_desc *desc;
+ int rc;
+
+ LASSERT(conf != NULL);
+
+ if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) {
+ CERROR("Unable to unregister a fallback policy, unless the PTLRPC service is stopping.\n");
+ return -EPERM;
+ }
+
+ conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ desc = nrs_policy_find_desc_locked(conf->nc_name);
+ if (desc == NULL) {
+ CERROR("Failing to unregister NRS policy %s which has not been registered with NRS core!\n",
+ conf->nc_name);
+ rc = -ENOENT;
+ goto not_exist;
+ }
+
+ mutex_lock(&ptlrpc_all_services_mutex);
+
+ rc = nrs_policy_unregister_locked(desc);
+ if (rc < 0) {
+ if (rc == -EBUSY)
+ CERROR("Please first stop policy %s on all service partitions and then retry to unregister the policy.\n",
+ conf->nc_name);
+ goto fail;
+ }
+
+ CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n",
+ conf->nc_name);
+
+ list_del(&desc->pd_list);
+ OBD_FREE_PTR(desc);
+
+fail:
+ mutex_unlock(&ptlrpc_all_services_mutex);
+
+not_exist:
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister);
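+
+/*
+ * For illustration, an external policy module would typically pair the two
+ * calls above in its init()/exit() handlers; a minimal sketch, assuming a
+ * hypothetical configuration nrs_conf_example that sets
+ * PTLRPC_NRS_FL_REG_EXTERN and nc_owner = THIS_MODULE:
+ *
+ *     static int __init nrs_example_init(void)
+ *     {
+ *             return ptlrpc_nrs_policy_register(&nrs_conf_example);
+ *     }
+ *
+ *     static void __exit nrs_example_exit(void)
+ *     {
+ *             ptlrpc_nrs_policy_unregister(&nrs_conf_example);
+ *     }
+ */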
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * To be called from ptlrpc_register_service().
+ *
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ * ptlrpc_service_nrs_cleanup() to undo any work performed
+ * by this function.
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ const struct ptlrpc_nrs_pol_desc *desc;
+ int i;
+ int rc = 0;
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ /**
+ * Initialize NRS heads on all service CPTs.
+ */
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ rc = nrs_svcpt_setup_locked(svcpt);
+ if (rc != 0)
+ goto failed;
+ }
+
+ /**
+ * Set up lprocfs interfaces for all supported policies for the
+ * service.
+ */
+ list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+ if (!nrs_policy_compatible(svc, desc))
+ continue;
+
+ if (desc->pd_ops->op_lprocfs_init != NULL) {
+ rc = desc->pd_ops->op_lprocfs_init(svc);
+ if (rc != 0)
+ goto failed;
+ }
+ }
+
+failed:
+
+ mutex_unlock(&nrs_core.nrs_mutex);
+
+ return rc;
+}
+
+/**
+ * Unregisters all policies on all service partitions of service \a svc.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ const struct ptlrpc_nrs_pol_desc *desc;
+ int i;
+
+ mutex_lock(&nrs_core.nrs_mutex);
+
+ /**
+ * Clean up NRS heads on all service partitions
+ */
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ nrs_svcpt_cleanup_locked(svcpt);
+
+ /**
+ * Clean up lprocfs interfaces for all supported policies for the
+ * service.
+ */
+ list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+ if (!nrs_policy_compatible(svc, desc))
+ continue;
+
+ if (desc->pd_ops->op_lprocfs_fini != NULL)
+ desc->pd_ops->op_lprocfs_fini(svc);
+ }
+
+ mutex_unlock(&nrs_core.nrs_mutex);
+}
+
+/**
+ * Obtains NRS head resources for request \a req.
+ *
+ * These could be either on the regular or HP NRS head of \a svcpt; resources
+ * taken on the regular head can later be swapped for HP head resources by
+ * ldlm_lock_reorder_req().
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req the request
+ * \param[in] hp which NRS head of \a svcpt to use
+ */
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req, bool hp)
+{
+ struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+ memset(&req->rq_nrq, 0, sizeof(req->rq_nrq));
+ nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs,
+ false);
+
+ /**
+ * It is fine to access \e nr_initialized without locking as there is
+ * no contention at this early stage.
+ */
+ req->rq_nrq.nr_initialized = 1;
+}
+
+/**
+ * Releases resources for a request; is called after the request has been
+ * handled.
+ *
+ * \param[in] req the request
+ *
+ * \see ptlrpc_server_finish_request()
+ */
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req)
+{
+ if (req->rq_nrq.nr_initialized) {
+ nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs);
+ /* no protection on bit nr_initialized because no
+ * contention at this late stage */
+ req->rq_nrq.nr_finalized = 1;
+ }
+}
+
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req)
+{
+ if (req->rq_nrq.nr_started)
+ nrs_request_stop(&req->rq_nrq);
+}
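+
+/*
+ * For orientation, the request-side entry points in this file are used in
+ * roughly the following order over the life of a request:
+ *
+ *     ptlrpc_nrs_req_initialize()  - take NRS head resources, nr_initialized
+ *     ptlrpc_nrs_req_add()         - enqueue the request on a policy
+ *     ptlrpc_nrs_req_get_nolock0() - dequeue it for handling, nr_started
+ *     ptlrpc_nrs_req_stop_nolock() - notify the policy handling has stopped
+ *     ptlrpc_nrs_req_finalize()    - release resources, nr_finalized
+ */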
+
+/**
+ * Enqueues request \a req on either the regular or high-priority NRS head
+ * of service partition \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req the request to be enqueued
+ * \param[in] hp whether to enqueue the request on the regular or
+ * high-priority NRS head.
+ */
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req, bool hp)
+{
+ spin_lock(&svcpt->scp_req_lock);
+
+ if (hp)
+ ptlrpc_nrs_hpreq_add_nolock(req);
+ else
+ ptlrpc_nrs_req_add_nolock(req);
+
+ spin_unlock(&svcpt->scp_req_lock);
+}
+
+static void nrs_request_removed(struct ptlrpc_nrs_policy *policy)
+{
+ LASSERT(policy->pol_nrs->nrs_req_queued > 0);
+ LASSERT(policy->pol_req_queued > 0);
+
+ policy->pol_nrs->nrs_req_queued--;
+ policy->pol_req_queued--;
+
+ /**
+ * If the policy has no more requests queued, remove it from
+ * ptlrpc_nrs::nrs_policy_queued.
+ */
+ if (unlikely(policy->pol_req_queued == 0)) {
+ list_del_init(&policy->pol_list_queued);
+
+ /**
+ * If there are other policies with queued requests, move the
+ * current policy to the end so that we can round robin over
+ * all policies and drain the requests.
+ */
+ } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) {
+ LASSERT(policy->pol_req_queued <
+ policy->pol_nrs->nrs_req_queued);
+
+ list_move_tail(&policy->pol_list_queued,
+ &policy->pol_nrs->nrs_policy_queued);
+ }
+}
+
+/**
+ * Obtains a request for handling from an NRS head of service partition
+ * \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] hp whether to obtain a request from the regular or
+ * high-priority NRS head.
+ * \param[in] peek when set, signifies that we just want to examine the
+ * request, and not handle it, so the request is not removed
+ * from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ * has one pending
+ *
+ * \retval the request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+ bool peek, bool force)
+{
+ struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+ struct ptlrpc_nrs_policy *policy;
+ struct ptlrpc_nrs_request *nrq;
+
+ /**
+ * Always try to drain requests from all NRS policies even if they are
+ * inactive, because the user can change policy status at runtime.
+ */
+ list_for_each_entry(policy, &nrs->nrs_policy_queued,
+ pol_list_queued) {
+ nrq = nrs_request_get(policy, peek, force);
+ if (nrq != NULL) {
+ if (likely(!peek)) {
+ nrq->nr_started = 1;
+
+ policy->pol_req_started++;
+ policy->pol_nrs->nrs_req_started++;
+
+ nrs_request_removed(policy);
+ }
+
+ return container_of(nrq, struct ptlrpc_request, rq_nrq);
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+ struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+ policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+ req->rq_nrq.nr_enqueued = 0;
+
+ nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of the NRS head of service partition \a svcpt, as specified by
+ * \a hp. Should be called while holding ptlrpc_service_part::scp_req_lock to
+ * get a reliable result.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp whether the regular or high-priority NRS head is to be
+ * enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+ struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+ return nrs->nrs_req_queued > 0;
+}
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+ struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+ struct ptlrpc_nrs_resource *res1[NRS_RES_MAX];
+ struct ptlrpc_nrs_resource *res2[NRS_RES_MAX];
+
+ /**
+ * Obtain the high-priority NRS head resources.
+ */
+ nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true);
+
+ spin_lock(&svcpt->scp_req_lock);
+
+ if (!ptlrpc_nrs_req_can_move(req))
+ goto out;
+
+ ptlrpc_nrs_req_del_nolock(req);
+
+ memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0]));
+ memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0]));
+
+ ptlrpc_nrs_hpreq_add_nolock(req);
+
+ memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0]));
+out:
+ spin_unlock(&svcpt->scp_req_lock);
+
+ /**
+ * Release either the regular NRS head resources if we moved the
+ * request, or the high-priority NRS head resources if we took a
+ * reference earlier in this function and ptlrpc_nrs_req_can_move()
+ * returned false.
+ */
+ nrs_resource_put_safe(res1);
+}
+
+/**
+ * Carries out a control operation \a opc on the policy identified by the
+ * human-readable \a name, on either all partitions, or only on the first
+ * partition of service \a svc.
+ *
+ * \param[in] svc the service the policy belongs to.
+ * \param[in] queue whether to carry out the command on the policy which
+ * belongs to the regular, high-priority, or both NRS
+ * heads of service partitions of \a svc.
+ * \param[in] name the policy to act upon, by human-readable name
+ * \param[in] opc the opcode of the operation to carry out
+ * \param[in] single when set, the operation will only be carried out on the
+ * NRS heads of the first service partition of \a svc.
+ * This is useful for some policies which e.g. share
+ * identical values on the same parameters of different
+ * service partitions; when reading these parameters via
+ * lprocfs, these policies may just want to obtain and
+ * print out the values from the first service partition.
+ * Storing these values centrally elsewhere would be
+ * another way to achieve this.
+ * \param[in,out] arg can be used as a generic in/out buffer between control
+ * operations and the user environment.
+ *
+ * \retval -ve error condition
+ * \retval 0 operation was carried out successfully
+ */
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+ enum ptlrpc_nrs_queue_type queue, char *name,
+ enum ptlrpc_nrs_ctl opc, bool single, void *arg)
+{
+ struct ptlrpc_service_part *svcpt;
+ int i;
+ int rc = 0;
+
+ LASSERT(opc != PTLRPC_NRS_CTL_INVALID);
+
+ if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0)
+ return -EINVAL;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+ rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name,
+ opc, arg);
+ if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG &&
+ single))
+ goto out;
+ }
+
+ if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+ /**
+ * XXX: We could optionally check for
+ * nrs_svc_has_hp(svc) here, and return an error if it
+ * is false. Right now we rely on the policies' lprocfs
+ * handlers that call the present function to make this
+ * check; if they fail to do so, they might hit the
+ * assertion inside nrs_svcpt2nrs() below.
+ */
+ rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name,
+ opc, arg);
+ if (rc != 0 || single)
+ goto out;
+ }
+ }
+out:
+ return rc;
+}
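+
+/*
+ * As a usage sketch, a policy's lprocfs write handler that starts a policy
+ * on both NRS heads of all partitions of \a svc could call (assuming the
+ * PTLRPC_NRS_CTL_START opcode from lustre_net.h, with cmd being a writable
+ * buffer holding the policy name, e.g. "fifo"):
+ *
+ *     rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_BOTH, cmd,
+ *                                    PTLRPC_NRS_CTL_START, false, NULL);
+ */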
+
+
+/* ptlrpc/nrs_fifo.c */
+extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+
+/**
+ * Adds all policies that ship with the ptlrpc module, to NRS core's list of
+ * policies \e nrs_core.nrs_policies.
+ *
+ * \retval 0 all policies have been registered successfully
+ * \retval -ve error
+ */
+int ptlrpc_nrs_init(void)
+{
+ int rc;
+
+ mutex_init(&nrs_core.nrs_mutex);
+ INIT_LIST_HEAD(&nrs_core.nrs_policies);
+
+ rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo);
+ if (rc != 0)
+ goto fail;
+
+
+ return rc;
+fail:
+ /**
+ * Since no PTLRPC services have been started at this point, all we need
+ * to do for cleanup is to free the descriptors.
+ */
+ ptlrpc_nrs_fini();
+
+ return rc;
+}
+
+/**
+ * Removes all policy descriptors from nrs_core::nrs_policies, and frees the
+ * policy descriptors.
+ *
+ * Since all PTLRPC services are stopped at this point, there are no more
+ * instances of any policies, because each service will have stopped its policy
+ * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the
+ * descriptors here.
+ */
+void ptlrpc_nrs_fini(void)
+{
+ struct ptlrpc_nrs_pol_desc *desc;
+ struct ptlrpc_nrs_pol_desc *tmp;
+
+ list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies,
+ pd_list) {
+ list_del_init(&desc->pd_list);
+ OBD_FREE_PTR(desc);
+ }
+}
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
new file mode 100644
index 000000000..eb40c01db
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
@@ -0,0 +1,270 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_fifo.c
+ *
+ * Network Request Scheduler (NRS) FIFO policy
+ *
+ * Handles RPCs in a FIFO manner, as received from the network. This policy is
+ * a logical wrapper around previous, non-NRS functionality. It is used as the
+ * default and fallback policy for all types of RPCs on all PTLRPC service
+ * partitions, for both regular and high-priority NRS heads. Default here means
+ * the policy is the one enabled at PTLRPC service partition startup time, and
+ * fallback means the policy is used to handle RPCs that are not handled
+ * successfully or are not handled at all by any primary policy that may be
+ * enabled on a given NRS head.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../../include/linux/libcfs/libcfs.h"
+#include "ptlrpc_internal.h"
+
+/**
+ * \name fifo
+ *
+ * The FIFO policy is a logical wrapper around previous, non-NRS functionality.
+ * It schedules RPCs in the same order as they are queued from LNet.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_FIFO "fifo"
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a
+ * policy-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ *
+ * \retval -ENOMEM OOM error
+ * \retval 0 success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy)
+{
+ struct nrs_fifo_head *head;
+
+ OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+ if (head == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&head->fh_list);
+ policy->pol_private = head;
+ return 0;
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy)
+{
+ struct nrs_fifo_head *head = policy->pol_private;
+
+ LASSERT(head != NULL);
+ LASSERT(list_empty(&head->fh_list));
+
+ OBD_FREE_PTR(head);
+}
+
+/**
+ * Is called for obtaining a FIFO policy resource.
+ *
+ * \param[in] policy The policy on which the request is being asked for
+ * \param[in] nrq The request for which resources are being taken
+ * \param[in] parent Parent resource, unused in this policy
+ * \param[out] resp Resources references are placed in this array
+ * \param[in] moving_req Signifies limited caller context; unused in this
+ * policy
+ *
+ * \retval 1 The FIFO policy only has a one-level resource hierarchy; since
+ * it implements a simple scheduling algorithm in which request
+ * priority is determined by the request arrival order, it does not
+ * need to maintain a set of resources that would otherwise be used
+ * to calculate a request's priority.
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq,
+ const struct ptlrpc_nrs_resource *parent,
+ struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+ /**
+ * Just return the resource embedded inside nrs_fifo_head, and end this
+ * resource hierarchy reference request.
+ */
+ *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res;
+ return 1;
+}
+
+/**
+ * Called when getting a request from the FIFO policy for handling, or just
+ * peeking; removes the request from the policy when it is to be handled.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek When set, signifies that we just want to examine the
+ * request, and not handle it, so the request is not removed
+ * from the policy.
+ * \param[in] force Force the policy to return a request; unused in this
+ * policy
+ *
+ * \retval The request to be handled; this is the next request in the FIFO
+ * queue
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy,
+ bool peek, bool force)
+{
+ struct nrs_fifo_head *head = policy->pol_private;
+ struct ptlrpc_nrs_request *nrq;
+
+ nrq = unlikely(list_empty(&head->fh_list)) ? NULL :
+ list_entry(head->fh_list.next, struct ptlrpc_nrs_request,
+ nr_u.fifo.fr_list);
+
+ if (likely(!peek && nrq != NULL)) {
+ struct ptlrpc_request *req = container_of(nrq,
+ struct ptlrpc_request,
+ rq_nrq);
+
+ list_del_init(&nrq->nr_u.fifo.fr_list);
+
+ CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu\n",
+ policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+ nrq->nr_u.fifo.fr_sequence);
+ }
+
+ return nrq;
+}
+
+/**
+ * Adds request \a nrq to \a policy's list of queued requests
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq The request to add
+ *
+ * \retval 0 success; nrs_request_enqueue() assumes this function will always
+ * succeed
+ */
+static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq)
+{
+ struct nrs_fifo_head *head;
+
+ head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head,
+ fh_res);
+ /**
+ * Only used for debugging
+ */
+ nrq->nr_u.fifo.fr_sequence = head->fh_sequence++;
+ list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list);
+
+ return 0;
+}
+
+/**
+ * Removes request \a nrq from \a policy's list of queued requests.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq The request to remove
+ */
+static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq)
+{
+ LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list));
+ list_del_init(&nrq->nr_u.fifo.fr_list);
+}
+
+/**
+ * Prints a debug statement right before the request \a nrq stops being
+ * handled.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy,
+ struct ptlrpc_nrs_request *nrq)
+{
+ struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+ rq_nrq);
+
+ CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n",
+ policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+ nrq->nr_u.fifo.fr_sequence);
+}
+
+/**
+ * FIFO policy operations
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = {
+ .op_policy_start = nrs_fifo_start,
+ .op_policy_stop = nrs_fifo_stop,
+ .op_res_get = nrs_fifo_res_get,
+ .op_req_get = nrs_fifo_req_get,
+ .op_req_enqueue = nrs_fifo_req_add,
+ .op_req_dequeue = nrs_fifo_req_del,
+ .op_req_stop = nrs_fifo_req_stop,
+};
+
+/**
+ * FIFO policy configuration
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_fifo = {
+ .nc_name = NRS_POL_NAME_FIFO,
+ .nc_ops = &nrs_fifo_ops,
+ .nc_compat = nrs_policy_compat_all,
+ .nc_flags = PTLRPC_NRS_FL_FALLBACK |
+ PTLRPC_NRS_FL_REG_START
+};
+
+/** @} fifo */
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
new file mode 100644
index 000000000..b51af9bf3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
@@ -0,0 +1,2536 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pack_generic.c
+ *
+ * (Un)packing of OST requests
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/obd_cksum.h"
+#include "../include/lustre/ll_fiemap.h"
+
+static inline int lustre_msg_hdr_size_v2(int count)
+{
+ return cfs_size_round(offsetof(struct lustre_msg_v2,
+ lm_buflens[count]));
+}
+
+int lustre_msg_hdr_size(__u32 magic, int count)
+{
+ switch (magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_msg_hdr_size_v2(count);
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", magic);
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_hdr_size);
+
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+ int index)
+{
+ if (inout)
+ lustre_set_req_swabbed(req, index);
+ else
+ lustre_set_rep_swabbed(req, index);
+}
+EXPORT_SYMBOL(ptlrpc_buf_set_swabbed);
+
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+ int index)
+{
+ if (inout)
+ return (ptlrpc_req_need_swab(req) &&
+ !lustre_req_swabbed(req, index));
+ else
+ return (ptlrpc_rep_need_swab(req) &&
+ !lustre_rep_swabbed(req, index));
+}
+EXPORT_SYMBOL(ptlrpc_buf_need_swab);
+
+static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg,
+ __u32 version)
+{
+ __u32 ver = lustre_msg_get_version(msg);
+ return (ver & LUSTRE_VERSION_MASK) != version;
+}
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ CERROR("msg v1 not supported - please upgrade you system\n");
+ return -EINVAL;
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_msg_check_version_v2(msg, version);
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_check_version);
+
+/* early reply size */
+int lustre_msg_early_size(void)
+{
+ static int size;
+ if (!size) {
+ /* Always reply old ptlrpc_body_v2 to keep interoperability
+ * with the old client (< 2.3) which doesn't have pb_jobid
+ * in the ptlrpc_body.
+ *
+ * XXX Remove this whenever we drop interoperability with such
+ * client.
+ */
+ __u32 pblen = sizeof(struct ptlrpc_body_v2);
+ size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen);
+ }
+ return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
+int lustre_msg_size_v2(int count, __u32 *lengths)
+{
+ int size;
+ int i;
+
+ size = lustre_msg_hdr_size_v2(count);
+ for (i = 0; i < count; i++)
+ size += cfs_size_round(lengths[i]);
+
+ return size;
+}
+EXPORT_SYMBOL(lustre_msg_size_v2);
+
+/* This returns the size of the buffer that is required to hold a lustre_msg
+ * with the given sub-buffer lengths.
+ * NOTE: this should only be used for NEW requests, and should always be
+ * in the form of a v2 request. If this is a connection to a v1
+ * target then the first buffer will be stripped because the ptlrpc
+ * data is part of the lustre_msg_v1 header. b=14043 */
+int lustre_msg_size(__u32 magic, int count, __u32 *lens)
+{
+ __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+ if (!lens) {
+ LASSERT(count == 1);
+ lens = size;
+ }
+
+ LASSERT(count > 0);
+ LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2));
+
+ switch (magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_msg_size_v2(count, lens);
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", magic);
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_size);
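+
+/*
+ * For example, the size needed for a two-buffer message carrying the
+ * mandatory ptlrpc_body plus a 128-byte payload could be computed as:
+ *
+ *     __u32 lens[2] = { sizeof(struct ptlrpc_body), 128 };
+ *     int size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 2, lens);
+ *
+ * i.e. the v2 header size for two buffers plus each length rounded up by
+ * cfs_size_round().
+ */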
+
+/* This is used to determine the size of a buffer that was already packed
+ * and will correctly handle the different message formats. */
+int lustre_packed_msg_size(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_packed_msg_size);
+
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+ char **bufs)
+{
+ char *ptr;
+ int i;
+
+ msg->lm_bufcount = count;
+ /* XXX: lm_secflvr uninitialized here */
+ msg->lm_magic = LUSTRE_MSG_MAGIC_V2;
+
+ for (i = 0; i < count; i++)
+ msg->lm_buflens[i] = lens[i];
+
+ if (bufs == NULL)
+ return;
+
+ ptr = (char *)msg + lustre_msg_hdr_size_v2(count);
+ for (i = 0; i < count; i++) {
+ char *tmp = bufs[i];
+ LOGL(tmp, lens[i], ptr);
+ }
+}
+EXPORT_SYMBOL(lustre_init_msg_v2);
+
+static int lustre_pack_request_v2(struct ptlrpc_request *req,
+ int count, __u32 *lens, char **bufs)
+{
+ int reqlen, rc;
+
+ reqlen = lustre_msg_size_v2(count, lens);
+
+ rc = sptlrpc_cli_alloc_reqbuf(req, reqlen);
+ if (rc)
+ return rc;
+
+ req->rq_reqlen = reqlen;
+
+ lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
+ lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
+ return 0;
+}
+
+int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count,
+ __u32 *lens, char **bufs)
+{
+ __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+ if (!lens) {
+ LASSERT(count == 1);
+ lens = size;
+ }
+
+ LASSERT(count > 0);
+ LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+ /* only use new format, we don't need to be compatible with 1.4 */
+ return lustre_pack_request_v2(req, count, lens, bufs);
+}
+EXPORT_SYMBOL(lustre_pack_request);
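+
+/*
+ * A caller packing a request by hand might, as a sketch (name/namelen being
+ * arbitrary example variables), do:
+ *
+ *     __u32 lens[2] = { sizeof(struct ptlrpc_body), namelen };
+ *     char *bufs[2] = { NULL, name };
+ *     int rc = lustre_pack_request(req, LUSTRE_MSG_MAGIC_V2, 2, lens, bufs);
+ *
+ * which allocates rq_reqmsg, copies in the non-NULL buffers and adds
+ * PTLRPC_MSG_VERSION to the message; most callers reach this through the
+ * request capsule/layout helpers rather than directly.
+ */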
+
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock;
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \
+do { \
+ spin_lock(&ptlrpc_rs_debug_lock); \
+ list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \
+ spin_unlock(&ptlrpc_rs_debug_lock); \
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \
+do { \
+ spin_lock(&ptlrpc_rs_debug_lock); \
+ list_del(&(rs)->rs_debug_list); \
+ spin_unlock(&ptlrpc_rs_debug_lock); \
+} while (0)
+#else
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while (0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while (0)
+#endif
+
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_reply_state *rs = NULL;
+
+ spin_lock(&svcpt->scp_rep_lock);
+
+ /* See if we have anything in a pool, and wait if nothing */
+ while (list_empty(&svcpt->scp_rep_idle)) {
+ struct l_wait_info lwi;
+ int rc;
+
+ spin_unlock(&svcpt->scp_rep_lock);
+ /* If we cannot get anything for a long time, we had better
+ * bail out instead of waiting indefinitely */
+ lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL);
+ rc = l_wait_event(svcpt->scp_rep_waitq,
+ !list_empty(&svcpt->scp_rep_idle), &lwi);
+ if (rc != 0)
+ goto out;
+ spin_lock(&svcpt->scp_rep_lock);
+ }
+
+ rs = list_entry(svcpt->scp_rep_idle.next,
+ struct ptlrpc_reply_state, rs_list);
+ list_del(&rs->rs_list);
+
+ spin_unlock(&svcpt->scp_rep_lock);
+
+ memset(rs, 0, svcpt->scp_service->srv_max_reply_size);
+ rs->rs_size = svcpt->scp_service->srv_max_reply_size;
+ rs->rs_svcpt = svcpt;
+ rs->rs_prealloc = 1;
+out:
+ return rs;
+}
+
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs)
+{
+ struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+ spin_lock(&svcpt->scp_rep_lock);
+ list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+ spin_unlock(&svcpt->scp_rep_lock);
+ wake_up(&svcpt->scp_rep_waitq);
+}
+
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+ __u32 *lens, char **bufs, int flags)
+{
+ struct ptlrpc_reply_state *rs;
+ int msg_len, rc;
+
+ LASSERT(req->rq_reply_state == NULL);
+
+ if ((flags & LPRFL_EARLY_REPLY) == 0) {
+ spin_lock(&req->rq_lock);
+ req->rq_packed_final = 1;
+ spin_unlock(&req->rq_lock);
+ }
+
+ msg_len = lustre_msg_size_v2(count, lens);
+ rc = sptlrpc_svc_alloc_rs(req, msg_len);
+ if (rc)
+ return rc;
+
+ rs = req->rq_reply_state;
+ atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */
+ rs->rs_cb_id.cbid_fn = reply_out_callback;
+ rs->rs_cb_id.cbid_arg = rs;
+ rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt;
+ INIT_LIST_HEAD(&rs->rs_exp_list);
+ INIT_LIST_HEAD(&rs->rs_obd_list);
+ INIT_LIST_HEAD(&rs->rs_list);
+ spin_lock_init(&rs->rs_lock);
+
+ req->rq_replen = msg_len;
+ req->rq_reply_state = rs;
+ req->rq_repmsg = rs->rs_msg;
+
+ lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
+ lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
+
+ PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+ return 0;
+}
+EXPORT_SYMBOL(lustre_pack_reply_v2);
+
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens,
+ char **bufs, int flags)
+{
+ int rc = 0;
+ __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+ if (!lens) {
+ LASSERT(count == 1);
+ lens = size;
+ }
+
+ LASSERT(count > 0);
+ LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+ switch (req->rq_reqmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ rc = lustre_pack_reply_v2(req, count, lens, bufs, flags);
+ break;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n",
+ req->rq_reqmsg->lm_magic);
+ rc = -EINVAL;
+ }
+ if (rc != 0)
+ CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc,
+ lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens));
+ return rc;
+}
+EXPORT_SYMBOL(lustre_pack_reply_flags);
+
+int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens,
+ char **bufs)
+{
+ return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+EXPORT_SYMBOL(lustre_pack_reply);
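+
+/*
+ * For example, a server-side handler replying with only the ptlrpc_body can
+ * simply do:
+ *
+ *     rc = lustre_pack_reply(req, 1, NULL, NULL);
+ *
+ * while one returning an additional fixed-size buffer passes explicit
+ * lengths (the second buffer here is only an illustration):
+ *
+ *     __u32 lens[2] = { sizeof(struct ptlrpc_body),
+ *                       sizeof(struct obd_statfs) };
+ *     rc = lustre_pack_reply(req, 2, lens, NULL);
+ */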
+
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
+{
+ int i, offset, buflen, bufcount;
+
+ LASSERT(m != NULL);
+ LASSERT(n >= 0);
+
+ bufcount = m->lm_bufcount;
+ if (unlikely(n >= bufcount)) {
+ CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+ m, n, bufcount);
+ return NULL;
+ }
+
+ buflen = m->lm_buflens[n];
+ if (unlikely(buflen < min_size)) {
+ CERROR("msg %p buffer[%d] size %d too small (required %d, opc=%d)\n",
+ m, n, buflen, min_size,
+ n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m));
+ return NULL;
+ }
+
+ offset = lustre_msg_hdr_size_v2(bufcount);
+ for (i = 0; i < n; i++)
+ offset += cfs_size_round(m->lm_buflens[i]);
+
+ return (char *)m + offset;
+}
+
+void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
+{
+ switch (m->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_msg_buf_v2(m, n, min_size);
+ default:
+ LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m);
+ return NULL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_buf);
+
+int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment,
+ unsigned int newlen, int move_data)
+{
+ char *tail = NULL, *newpos;
+ int tail_len = 0, n;
+
+ LASSERT(msg);
+ LASSERT(msg->lm_bufcount > segment);
+ LASSERT(msg->lm_buflens[segment] >= newlen);
+
+ if (msg->lm_buflens[segment] == newlen)
+ goto out;
+
+ if (move_data && msg->lm_bufcount > segment + 1) {
+ tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+ for (n = segment + 1; n < msg->lm_bufcount; n++)
+ tail_len += cfs_size_round(msg->lm_buflens[n]);
+ }
+
+ msg->lm_buflens[segment] = newlen;
+
+ if (tail && tail_len) {
+ newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+ LASSERT(newpos <= tail);
+ if (newpos != tail)
+ memmove(newpos, tail, tail_len);
+ }
+out:
+ return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * for @msg, shrink @segment to size @newlen. if @move_data is non-zero,
+ * we also move data forward from @segment + 1.
+ *
+ * if @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to save possible data moving. this will leave an
+ * unused segment with size 0 at the tail, but that's ok.
+ *
+ * return new msg size after shrinking.
+ *
+ * CAUTION:
+ * + if any buffers higher than @segment have been filled in, shrink must be
+ * called with non-zero @move_data.
+ * + the caller should NOT keep pointers to msg buffers higher than @segment
+ * after calling shrink.
+ */
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+ unsigned int newlen, int move_data)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_shrink_msg_v2(msg, segment, newlen, move_data);
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_shrink_msg);
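+
+/*
+ * A typical use, sketched with an illustrative buffer index and length, is
+ * trimming a reply that was packed with a worst-case buffer size down to the
+ * length actually used, before it is sent:
+ *
+ *     req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 2, used_len, 1);
+ *
+ * move_data is non-zero here because buffers after index 2 may already have
+ * been filled in.
+ */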
+
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+ PTLRPC_RS_DEBUG_LRU_DEL(rs);
+
+ LASSERT(atomic_read(&rs->rs_refcount) == 0);
+ LASSERT(!rs->rs_difficult || rs->rs_handled);
+ LASSERT(!rs->rs_on_net);
+ LASSERT(!rs->rs_scheduled);
+ LASSERT(rs->rs_export == NULL);
+ LASSERT(rs->rs_nlocks == 0);
+ LASSERT(list_empty(&rs->rs_exp_list));
+ LASSERT(list_empty(&rs->rs_obd_list));
+
+ sptlrpc_svc_free_rs(rs);
+}
+EXPORT_SYMBOL(lustre_free_reply_state);
+
+static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
+{
+ int swabbed, required_len, i;
+
+ /* Now we know the sender speaks my language. */
+ required_len = lustre_msg_hdr_size_v2(0);
+ if (len < required_len) {
+ /* can't even look inside the message */
+ CERROR("message length %d too small for lustre_msg\n", len);
+ return -EINVAL;
+ }
+
+ swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+ if (swabbed) {
+ __swab32s(&m->lm_magic);
+ __swab32s(&m->lm_bufcount);
+ __swab32s(&m->lm_secflvr);
+ __swab32s(&m->lm_repsize);
+ __swab32s(&m->lm_cksum);
+ __swab32s(&m->lm_flags);
+ CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
+ CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
+ }
+
+ required_len = lustre_msg_hdr_size_v2(m->lm_bufcount);
+ if (len < required_len) {
+ /* didn't receive all the buffer lengths */
+ CERROR("message length %d too small for %d buflens\n",
+ len, m->lm_bufcount);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < m->lm_bufcount; i++) {
+ if (swabbed)
+ __swab32s(&m->lm_buflens[i]);
+ required_len += cfs_size_round(m->lm_buflens[i]);
+ }
+
+ if (len < required_len) {
+ CERROR("len: %d, required_len %d\n", len, required_len);
+ CERROR("bufcount: %d\n", m->lm_bufcount);
+ for (i = 0; i < m->lm_bufcount; i++)
+ CERROR("buffer %d length %d\n", i, m->lm_buflens[i]);
+ return -EINVAL;
+ }
+
+ return swabbed;
+}
+
+int __lustre_unpack_msg(struct lustre_msg *m, int len)
+{
+ int required_len, rc;
+
+ /* We can provide a slightly better error log if we check the
+ * message magic and version first. In the future, struct
+ * lustre_msg may grow, and we'd like to log a version mismatch
+ * rather than a short message.
+ */
+ required_len = offsetof(struct lustre_msg, lm_magic) +
+ sizeof(m->lm_magic);
+ if (len < required_len) {
+ /* can't even look inside the message */
+ CERROR("message length %d too small for magic/version check\n",
+ len);
+ return -EINVAL;
+ }
+
+ rc = lustre_unpack_msg_v2(m, len);
+
+ return rc;
+}
+EXPORT_SYMBOL(__lustre_unpack_msg);
+
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len)
+{
+ int rc;
+ rc = __lustre_unpack_msg(req->rq_reqmsg, len);
+ if (rc == 1) {
+ lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+ rc = 0;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_req_msg);
+
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len)
+{
+ int rc;
+ rc = __lustre_unpack_msg(req->rq_repmsg, len);
+ if (rc == 1) {
+ lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+ rc = 0;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_rep_msg);
+
+static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req,
+ const int inout, int offset)
+{
+ struct ptlrpc_body *pb;
+ struct lustre_msg_v2 *m = inout ? req->rq_reqmsg : req->rq_repmsg;
+
+ pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2));
+ if (!pb) {
+ CERROR("error unpacking ptlrpc body\n");
+ return -EFAULT;
+ }
+ if (ptlrpc_buf_need_swab(req, inout, offset)) {
+ lustre_swab_ptlrpc_body(pb);
+ ptlrpc_buf_set_swabbed(req, inout, offset);
+ }
+
+ if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) {
+ CERROR("wrong lustre_msg version %08x\n", pb->pb_version);
+ return -EINVAL;
+ }
+
+ if (!inout)
+ pb->pb_status = ptlrpc_status_ntoh(pb->pb_status);
+
+ return 0;
+}
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+ switch (req->rq_reqmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_unpack_ptlrpc_body_v2(req, 1, offset);
+ default:
+ CERROR("bad lustre msg magic: %08x\n",
+ req->rq_reqmsg->lm_magic);
+ return -EINVAL;
+ }
+}
+
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+ switch (req->rq_repmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_unpack_ptlrpc_body_v2(req, 0, offset);
+ default:
+ CERROR("bad lustre msg magic: %08x\n",
+ req->rq_repmsg->lm_magic);
+ return -EINVAL;
+ }
+}
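+
+/*
+ * Taken together, a caller processing a received reply first unpacks (and,
+ * if needed, swabs) the message header and buffer lengths, then the ptlrpc
+ * body; a minimal sketch:
+ *
+ *     rc = ptlrpc_unpack_rep_msg(req, len);
+ *     if (rc)
+ *             return rc;
+ *
+ *     rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ */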
+
+static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n)
+{
+ if (n >= m->lm_bufcount)
+ return 0;
+
+ return m->lm_buflens[n];
+}
+
+/**
+ * lustre_msg_buflen - return the length of buffer \a n in message \a m
+ * \param m lustre_msg (request or reply) to look at
+ * \param n message index (base 0)
+ *
+ * returns zero for non-existent message indices
+ */
+int lustre_msg_buflen(struct lustre_msg *m, int n)
+{
+ switch (m->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_msg_buflen_v2(m, n);
+ default:
+ CERROR("incorrect message magic: %08x\n", m->lm_magic);
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_buflen);
+
+static inline void
+lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len)
+{
+ if (n >= m->lm_bufcount)
+ LBUG();
+
+ m->lm_buflens[n] = len;
+}
+
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len)
+{
+ switch (m->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ lustre_msg_set_buflen_v2(m, n, len);
+ return;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_buflen);
+
+/* NB return the bufcount for lustre_msg_v2 format, so if message is packed
+ * in V1 format, the result is one bigger. (add struct ptlrpc_body). */
+int lustre_msg_bufcount(struct lustre_msg *m)
+{
+ switch (m->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return m->lm_bufcount;
+ default:
+ CERROR("incorrect message magic: %08x\n", m->lm_magic);
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_bufcount);
+
+char *lustre_msg_string(struct lustre_msg *m, int index, int max_len)
+{
+ /* max_len == 0 means the string should fill the buffer */
+ char *str;
+ int slen, blen;
+
+ switch (m->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ str = lustre_msg_buf_v2(m, index, 0);
+ blen = lustre_msg_buflen_v2(m, index);
+ break;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+ }
+
+ if (str == NULL) {
+ CERROR("can't unpack string in msg %p buffer[%d]\n", m, index);
+ return NULL;
+ }
+
+ slen = strnlen(str, blen);
+
+ if (slen == blen) { /* not NULL terminated */
+ CERROR("can't unpack non-NULL terminated string in msg %p buffer[%d] len %d\n",
+ m, index, blen);
+ return NULL;
+ }
+
+ if (max_len == 0) {
+ if (slen != blen - 1) {
+ CERROR("can't unpack short string in msg %p buffer[%d] len %d: strlen %d\n",
+ m, index, blen, slen);
+ return NULL;
+ }
+ } else if (slen > max_len) {
+ CERROR("can't unpack oversized string in msg %p buffer[%d] len %d strlen %d: max %d expected\n",
+ m, index, blen, slen, max_len);
+ return NULL;
+ }
+
+ return str;
+}
+EXPORT_SYMBOL(lustre_msg_string);
+
+/* Wrap up the normal fixed length cases */
+static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index,
+ int min_size, void *swabber)
+{
+ void *ptr = NULL;
+
+ LASSERT(msg != NULL);
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ ptr = lustre_msg_buf_v2(msg, index, min_size);
+ break;
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ }
+
+ if (ptr && swabber)
+ ((void (*)(void *))swabber)(ptr);
+
+ return ptr;
+}
+
+static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg)
+{
+ return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+ sizeof(struct ptlrpc_body_v2));
+}
+
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2:
+ /* already in host endian */
+ return msg->lm_flags;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msghdr_get_flags);
+
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2:
+ msg->lm_flags = flags;
+ return;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+__u32 lustre_msg_get_flags(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_flags;
+ }
+ default:
+ /* flags might be printed in debug code while message
+ * uninitialized */
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_flags);
+
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_flags |= flags;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_add_flags);
+
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_flags = flags;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_flags);
+
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags);
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_clear_flags);
+
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_op_flags;
+ }
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_op_flags);
+
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_op_flags |= flags;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_add_op_flags);
+
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_op_flags |= flags;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_op_flags);
+
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return NULL;
+ }
+ return &pb->pb_handle;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return NULL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_handle);
+
+__u32 lustre_msg_get_type(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return PTL_RPC_MSG_ERR;
+ }
+ return pb->pb_type;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return PTL_RPC_MSG_ERR;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_type);
+
+__u32 lustre_msg_get_version(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_version;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_version);
+
+void lustre_msg_add_version(struct lustre_msg *msg, int version)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_version |= version;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_add_version);
+
+__u32 lustre_msg_get_opc(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_opc;
+ }
+ default:
+ CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg);
+ LBUG();
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_opc);
+
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_last_xid;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_last_xid);
+
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_last_committed;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_last_committed);
+
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return NULL;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return NULL;
+ }
+ return pb->pb_pre_versions;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return NULL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_versions);
+
+__u64 lustre_msg_get_transno(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_transno;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_transno);
+
+int lustre_msg_get_status(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return -EINVAL;
+ }
+ return pb->pb_status;
+ }
+ default:
+ /* status might be printed in debug code while the message
+ * is still uninitialized */
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_status);
+
+__u64 lustre_msg_get_slv(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return -EINVAL;
+ }
+ return pb->pb_slv;
+ }
+ default:
+ CERROR("invalid msg magic %08x\n", msg->lm_magic);
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_slv);
+
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return;
+ }
+ pb->pb_slv = slv;
+ return;
+ }
+ default:
+ CERROR("invalid msg magic %08x\n", msg->lm_magic);
+ return;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_slv);
+
+__u32 lustre_msg_get_limit(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return -EINVAL;
+ }
+ return pb->pb_limit;
+ }
+ default:
+ CERROR("invalid msg magic %08x\n", msg->lm_magic);
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_limit);
+
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return;
+ }
+ pb->pb_limit = limit;
+ return;
+ }
+ default:
+ CERROR("invalid msg magic %08x\n", msg->lm_magic);
+ return;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_limit);
+
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_conn_cnt;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 1;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_is_v1);
+
+__u32 lustre_msg_get_magic(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return msg->lm_magic;
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_magic);
+
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_timeout;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return 0;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ if (!pb) {
+ CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+ return 0;
+ }
+ return pb->pb_service_time;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+char *lustre_msg_get_jobid(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ case LUSTRE_MSG_MAGIC_V1_SWABBED:
+ return NULL;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb =
+ lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+ sizeof(struct ptlrpc_body));
+ if (!pb)
+ return NULL;
+
+ return pb->pb_jobid;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return NULL;
+ }
+}
+EXPORT_SYMBOL(lustre_msg_get_jobid);
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return msg->lm_cksum;
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
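+/* Compute a CRC32 over the ptlrpc_body buffer only (not the whole message);
+ * lm_cksum is consulted e.g. when validating early replies. */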
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ __u32 crc;
+ unsigned int hsize = 4;
+ cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+ lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF),
+ NULL, 0, (unsigned char *)&crc, &hsize);
+ return crc;
+ }
+ default:
+ CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+ return 0;
+ }
+}
+
+void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_handle = *handle;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_handle);
+
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_type = type;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_type);
+
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_opc = opc;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_opc);
+
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_last_xid = last_xid;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_last_xid);
+
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_last_committed = last_committed;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_last_committed);
+
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_pre_versions[0] = versions[0];
+ pb->pb_pre_versions[1] = versions[1];
+ pb->pb_pre_versions[2] = versions[2];
+ pb->pb_pre_versions[3] = versions[3];
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_versions);
+
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_transno = transno;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_transno);
+
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_status = status;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_status);
+
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_conn_cnt = conn_cnt;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_conn_cnt);
+
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_timeout = timeout;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+ pb->pb_service_time = service_time;
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2: {
+ __u32 opc = lustre_msg_get_opc(msg);
+ struct ptlrpc_body *pb;
+
+ /* Don't set the jobid for LDLM AST RPCs; their ptlrpc_body has been
+ * shrunk and no longer carries a jobid field. See the comment in
+ * ptlrpc_request_pack(). */
+ if (!opc || opc == LDLM_BL_CALLBACK ||
+ opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK)
+ return;
+
+ pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+ sizeof(struct ptlrpc_body));
+ LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+
+ if (jobid != NULL)
+ memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE);
+ else if (pb->pb_jobid[0] == '\0')
+ lustre_get_jobid(pb->pb_jobid);
+ return;
+ }
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+EXPORT_SYMBOL(lustre_msg_set_jobid);
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+ switch (msg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V1:
+ return;
+ case LUSTRE_MSG_MAGIC_V2:
+ msg->lm_cksum = cksum;
+ return;
+ default:
+ LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+ }
+}
+
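+/* Compute rq_replen from the reply-side buffer sizes already filled into the
+ * request capsule and, for V2 messages, advertise it to the peer in
+ * lm_repsize so the reply buffer can be sized accordingly. */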
+void ptlrpc_request_set_replen(struct ptlrpc_request *req)
+{
+ int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER);
+
+ req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count,
+ req->rq_pill.rc_area[RCL_SERVER]);
+ if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+ req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_request_set_replen);
+
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens)
+{
+ req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+ if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+ req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_req_set_repsize);
+
+/**
+ * Send a remote set_info_async.
+ *
+ * This may go from client to server or server to client.
+ */
+int do_set_info_async(struct obd_import *imp,
+ int opcode, int version,
+ u32 keylen, void *key,
+ u32 vallen, void *val,
+ struct ptlrpc_request_set *set)
+{
+ struct ptlrpc_request *req;
+ char *tmp;
+ int rc;
+
+ req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+ RCL_CLIENT, keylen);
+ req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+ RCL_CLIENT, vallen);
+ rc = ptlrpc_request_pack(req, version, opcode);
+ if (rc) {
+ ptlrpc_request_free(req);
+ return rc;
+ }
+
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+ memcpy(tmp, key, keylen);
+ tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+ memcpy(tmp, val, vallen);
+
+ ptlrpc_request_set_replen(req);
+
+ if (set) {
+ ptlrpc_set_add_req(set, req);
+ ptlrpc_check_set(NULL, set);
+ } else {
+ rc = ptlrpc_queue_wait(req);
+ ptlrpc_req_finished(req);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(do_set_info_async);
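+
+/* Illustrative only (not part of this patch): with a NULL request set the
+ * call is synchronous, e.g. roughly
+ *
+ *	rc = do_set_info_async(imp, OST_SET_INFO, LUSTRE_OST_VERSION,
+ *			       strlen(key), key, vallen, val, NULL);
+ *
+ * whereas passing a set merely queues the request on it and kicks
+ * ptlrpc_check_set() so it is sent asynchronously. */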
+
+/* Byte-flipping (swab) routines for all wire types declared in
+ * lustre_idl.h are implemented here.
+ */
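+
+/* These helpers are normally not called directly; they are typically
+ * installed as the swabber callbacks of the message-field definitions in
+ * layout.c and only run when the receiver detects a byte-swapped peer. */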
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
+{
+ __swab32s(&b->pb_type);
+ __swab32s(&b->pb_version);
+ __swab32s(&b->pb_opc);
+ __swab32s(&b->pb_status);
+ __swab64s(&b->pb_last_xid);
+ __swab64s(&b->pb_last_seen);
+ __swab64s(&b->pb_last_committed);
+ __swab64s(&b->pb_transno);
+ __swab32s(&b->pb_flags);
+ __swab32s(&b->pb_op_flags);
+ __swab32s(&b->pb_conn_cnt);
+ __swab32s(&b->pb_timeout);
+ __swab32s(&b->pb_service_time);
+ __swab32s(&b->pb_limit);
+ __swab64s(&b->pb_slv);
+ __swab64s(&b->pb_pre_versions[0]);
+ __swab64s(&b->pb_pre_versions[1]);
+ __swab64s(&b->pb_pre_versions[2]);
+ __swab64s(&b->pb_pre_versions[3]);
+ CLASSERT(offsetof(typeof(*b), pb_padding) != 0);
+ /* While we need to maintain compatibility between
+ * clients and servers without ptlrpc_body_v2 (< 2.3)
+ * do not swab any fields beyond pb_jobid, as we are
+ * using this swab function for both ptlrpc_body
+ * and ptlrpc_body_v2. */
+ CLASSERT(offsetof(typeof(*b), pb_jobid) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_ptlrpc_body);
+
+void lustre_swab_connect(struct obd_connect_data *ocd)
+{
+ __swab64s(&ocd->ocd_connect_flags);
+ __swab32s(&ocd->ocd_version);
+ __swab32s(&ocd->ocd_grant);
+ __swab64s(&ocd->ocd_ibits_known);
+ __swab32s(&ocd->ocd_index);
+ __swab32s(&ocd->ocd_brw_size);
+ /* ocd_blocksize and ocd_inodespace don't need to be swabbed because
+ * they are single-byte (__u8) values */
+ __swab16s(&ocd->ocd_grant_extent);
+ __swab32s(&ocd->ocd_unused);
+ __swab64s(&ocd->ocd_transno);
+ __swab32s(&ocd->ocd_group);
+ __swab32s(&ocd->ocd_cksum_types);
+ __swab32s(&ocd->ocd_instance);
+ /* Fields after ocd_cksum_types are only accessible by the receiver
+ * if the corresponding flag in ocd_connect_flags is set. Accessing
+ * any field after ocd_maxbytes on the receiver without a valid flag
+ * may result in out-of-bound memory access and kernel oops. */
+ if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)
+ __swab32s(&ocd->ocd_max_easize);
+ if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
+ __swab64s(&ocd->ocd_maxbytes);
+ CLASSERT(offsetof(typeof(*ocd), padding1) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding2) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding5) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding6) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding7) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding8) != 0);
+ CLASSERT(offsetof(typeof(*ocd), padding9) != 0);
+ CLASSERT(offsetof(typeof(*ocd), paddingA) != 0);
+ CLASSERT(offsetof(typeof(*ocd), paddingB) != 0);
+ CLASSERT(offsetof(typeof(*ocd), paddingC) != 0);
+ CLASSERT(offsetof(typeof(*ocd), paddingD) != 0);
+ CLASSERT(offsetof(typeof(*ocd), paddingE) != 0);
+ CLASSERT(offsetof(typeof(*ocd), paddingF) != 0);
+}
+
+void lustre_swab_obdo(struct obdo *o)
+{
+ __swab64s(&o->o_valid);
+ lustre_swab_ost_id(&o->o_oi);
+ __swab64s(&o->o_parent_seq);
+ __swab64s(&o->o_size);
+ __swab64s(&o->o_mtime);
+ __swab64s(&o->o_atime);
+ __swab64s(&o->o_ctime);
+ __swab64s(&o->o_blocks);
+ __swab64s(&o->o_grant);
+ __swab32s(&o->o_blksize);
+ __swab32s(&o->o_mode);
+ __swab32s(&o->o_uid);
+ __swab32s(&o->o_gid);
+ __swab32s(&o->o_flags);
+ __swab32s(&o->o_nlink);
+ __swab32s(&o->o_parent_oid);
+ __swab32s(&o->o_misc);
+ __swab64s(&o->o_ioepoch);
+ __swab32s(&o->o_stripe_idx);
+ __swab32s(&o->o_parent_ver);
+ /* o_handle is opaque */
+ /* o_lcookie is swabbed elsewhere */
+ __swab32s(&o->o_uid_h);
+ __swab32s(&o->o_gid_h);
+ __swab64s(&o->o_data_version);
+ CLASSERT(offsetof(typeof(*o), o_padding_4) != 0);
+ CLASSERT(offsetof(typeof(*o), o_padding_5) != 0);
+ CLASSERT(offsetof(typeof(*o), o_padding_6) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obdo);
+
+void lustre_swab_obd_statfs(struct obd_statfs *os)
+{
+ __swab64s(&os->os_type);
+ __swab64s(&os->os_blocks);
+ __swab64s(&os->os_bfree);
+ __swab64s(&os->os_bavail);
+ __swab64s(&os->os_files);
+ __swab64s(&os->os_ffree);
+ /* no need to swab os_fsid */
+ __swab32s(&os->os_bsize);
+ __swab32s(&os->os_namelen);
+ __swab64s(&os->os_maxbytes);
+ __swab32s(&os->os_state);
+ CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare2) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare3) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare4) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare5) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare6) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare7) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare8) != 0);
+ CLASSERT(offsetof(typeof(*os), os_spare9) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obd_statfs);
+
+void lustre_swab_obd_ioobj(struct obd_ioobj *ioo)
+{
+ lustre_swab_ost_id(&ioo->ioo_oid);
+ __swab32s(&ioo->ioo_max_brw);
+ __swab32s(&ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(lustre_swab_obd_ioobj);
+
+void lustre_swab_niobuf_remote(struct niobuf_remote *nbr)
+{
+ __swab64s(&nbr->offset);
+ __swab32s(&nbr->len);
+ __swab32s(&nbr->flags);
+}
+EXPORT_SYMBOL(lustre_swab_niobuf_remote);
+
+void lustre_swab_ost_body(struct ost_body *b)
+{
+ lustre_swab_obdo(&b->oa);
+}
+EXPORT_SYMBOL(lustre_swab_ost_body);
+
+void lustre_swab_ost_last_id(u64 *id)
+{
+ __swab64s(id);
+}
+EXPORT_SYMBOL(lustre_swab_ost_last_id);
+
+void lustre_swab_generic_32s(__u32 *val)
+{
+ __swab32s(val);
+}
+EXPORT_SYMBOL(lustre_swab_generic_32s);
+
+void lustre_swab_gl_desc(union ldlm_gl_desc *desc)
+{
+ lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid);
+ __swab64s(&desc->lquota_desc.gl_flags);
+ __swab64s(&desc->lquota_desc.gl_ver);
+ __swab64s(&desc->lquota_desc.gl_hardlimit);
+ __swab64s(&desc->lquota_desc.gl_softlimit);
+ __swab64s(&desc->lquota_desc.gl_time);
+ CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0);
+}
+
+void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb)
+{
+ __swab64s(&lvb->lvb_size);
+ __swab64s(&lvb->lvb_mtime);
+ __swab64s(&lvb->lvb_atime);
+ __swab64s(&lvb->lvb_ctime);
+ __swab64s(&lvb->lvb_blocks);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb_v1);
+
+void lustre_swab_ost_lvb(struct ost_lvb *lvb)
+{
+ __swab64s(&lvb->lvb_size);
+ __swab64s(&lvb->lvb_mtime);
+ __swab64s(&lvb->lvb_atime);
+ __swab64s(&lvb->lvb_ctime);
+ __swab64s(&lvb->lvb_blocks);
+ __swab32s(&lvb->lvb_mtime_ns);
+ __swab32s(&lvb->lvb_atime_ns);
+ __swab32s(&lvb->lvb_ctime_ns);
+ __swab32s(&lvb->lvb_padding);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb);
+
+void lustre_swab_lquota_lvb(struct lquota_lvb *lvb)
+{
+ __swab64s(&lvb->lvb_flags);
+ __swab64s(&lvb->lvb_id_may_rel);
+ __swab64s(&lvb->lvb_id_rel);
+ __swab64s(&lvb->lvb_id_qunit);
+ __swab64s(&lvb->lvb_pad1);
+}
+EXPORT_SYMBOL(lustre_swab_lquota_lvb);
+
+void lustre_swab_mdt_body(struct mdt_body *b)
+{
+ lustre_swab_lu_fid(&b->fid1);
+ lustre_swab_lu_fid(&b->fid2);
+ /* handle is opaque */
+ __swab64s(&b->valid);
+ __swab64s(&b->size);
+ __swab64s(&b->mtime);
+ __swab64s(&b->atime);
+ __swab64s(&b->ctime);
+ __swab64s(&b->blocks);
+ __swab64s(&b->ioepoch);
+ __swab64s(&b->t_state);
+ __swab32s(&b->fsuid);
+ __swab32s(&b->fsgid);
+ __swab32s(&b->capability);
+ __swab32s(&b->mode);
+ __swab32s(&b->uid);
+ __swab32s(&b->gid);
+ __swab32s(&b->flags);
+ __swab32s(&b->rdev);
+ __swab32s(&b->nlink);
+ CLASSERT(offsetof(typeof(*b), unused2) != 0);
+ __swab32s(&b->suppgid);
+ __swab32s(&b->eadatasize);
+ __swab32s(&b->aclsize);
+ __swab32s(&b->max_mdsize);
+ __swab32s(&b->max_cookiesize);
+ __swab32s(&b->uid_h);
+ __swab32s(&b->gid_h);
+ CLASSERT(offsetof(typeof(*b), padding_5) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_body);
+
+void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b)
+{
+ /* handle is opaque */
+ __swab64s(&b->ioepoch);
+ __swab32s(&b->flags);
+ CLASSERT(offsetof(typeof(*b), padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_ioepoch);
+
+void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
+{
+ int i;
+ __swab32s(&mti->mti_lustre_ver);
+ __swab32s(&mti->mti_stripe_index);
+ __swab32s(&mti->mti_config_ver);
+ __swab32s(&mti->mti_flags);
+ __swab32s(&mti->mti_instance);
+ __swab32s(&mti->mti_nid_count);
+ CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+ for (i = 0; i < MTI_NIDS_MAX; i++)
+ __swab64s(&mti->mti_nids[i]);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_target_info);
+
+void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
+{
+ int i;
+
+ __swab64s(&entry->mne_version);
+ __swab32s(&entry->mne_instance);
+ __swab32s(&entry->mne_index);
+ __swab32s(&entry->mne_length);
+
+ /* mne_nid_(count|type) must be one byte in size because we
+ * access them without byte-swapping. */
+ CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+ CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+
+ /* remove this assertion if ipv6 is supported. */
+ LASSERT(entry->mne_nid_type == 0);
+ for (i = 0; i < entry->mne_nid_count; i++) {
+ CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+ __swab64s(&entry->u.nids[i]);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
+
+void lustre_swab_mgs_config_body(struct mgs_config_body *body)
+{
+ __swab64s(&body->mcb_offset);
+ __swab32s(&body->mcb_units);
+ __swab16s(&body->mcb_type);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_body);
+
+void lustre_swab_mgs_config_res(struct mgs_config_res *body)
+{
+ __swab64s(&body->mcr_offset);
+ __swab64s(&body->mcr_size);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_res);
+
+static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i)
+{
+ __swab64s(&i->dqi_bgrace);
+ __swab64s(&i->dqi_igrace);
+ __swab32s(&i->dqi_flags);
+ __swab32s(&i->dqi_valid);
+}
+
+static void lustre_swab_obd_dqblk(struct obd_dqblk *b)
+{
+ __swab64s(&b->dqb_ihardlimit);
+ __swab64s(&b->dqb_isoftlimit);
+ __swab64s(&b->dqb_curinodes);
+ __swab64s(&b->dqb_bhardlimit);
+ __swab64s(&b->dqb_bsoftlimit);
+ __swab64s(&b->dqb_curspace);
+ __swab64s(&b->dqb_btime);
+ __swab64s(&b->dqb_itime);
+ __swab32s(&b->dqb_valid);
+ CLASSERT(offsetof(typeof(*b), dqb_padding) != 0);
+}
+
+void lustre_swab_obd_quotactl(struct obd_quotactl *q)
+{
+ __swab32s(&q->qc_cmd);
+ __swab32s(&q->qc_type);
+ __swab32s(&q->qc_id);
+ __swab32s(&q->qc_stat);
+ lustre_swab_obd_dqinfo(&q->qc_dqinfo);
+ lustre_swab_obd_dqblk(&q->qc_dqblk);
+}
+EXPORT_SYMBOL(lustre_swab_obd_quotactl);
+
+void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p)
+{
+ __swab32s(&p->rp_uid);
+ __swab32s(&p->rp_gid);
+ __swab32s(&p->rp_fsuid);
+ __swab32s(&p->rp_fsuid_h);
+ __swab32s(&p->rp_fsgid);
+ __swab32s(&p->rp_fsgid_h);
+ __swab32s(&p->rp_access_perm);
+ __swab32s(&p->rp_padding);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_remote_perm);
+
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{
+ lustre_swab_lu_fid(&gf->gf_fid);
+ __swab64s(&gf->gf_recno);
+ __swab32s(&gf->gf_linkno);
+ __swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
+void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent)
+{
+ __swab64s(&fm_extent->fe_logical);
+ __swab64s(&fm_extent->fe_physical);
+ __swab64s(&fm_extent->fe_length);
+ __swab32s(&fm_extent->fe_flags);
+ __swab32s(&fm_extent->fe_device);
+}
+
+void lustre_swab_fiemap(struct ll_user_fiemap *fiemap)
+{
+ int i;
+
+ __swab64s(&fiemap->fm_start);
+ __swab64s(&fiemap->fm_length);
+ __swab32s(&fiemap->fm_flags);
+ __swab32s(&fiemap->fm_mapped_extents);
+ __swab32s(&fiemap->fm_extent_count);
+ __swab32s(&fiemap->fm_reserved);
+
+ for (i = 0; i < fiemap->fm_mapped_extents; i++)
+ lustre_swab_fiemap_extent(&fiemap->fm_extents[i]);
+}
+EXPORT_SYMBOL(lustre_swab_fiemap);
+
+void lustre_swab_idx_info(struct idx_info *ii)
+{
+ __swab32s(&ii->ii_magic);
+ __swab32s(&ii->ii_flags);
+ __swab16s(&ii->ii_count);
+ __swab32s(&ii->ii_attrs);
+ lustre_swab_lu_fid(&ii->ii_fid);
+ __swab64s(&ii->ii_version);
+ __swab64s(&ii->ii_hash_start);
+ __swab64s(&ii->ii_hash_end);
+ __swab16s(&ii->ii_keysize);
+ __swab16s(&ii->ii_recsize);
+}
+
+void lustre_swab_lip_header(struct lu_idxpage *lip)
+{
+ /* swab header */
+ __swab32s(&lip->lip_magic);
+ __swab16s(&lip->lip_flags);
+ __swab16s(&lip->lip_nr);
+}
+EXPORT_SYMBOL(lustre_swab_lip_header);
+
+void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr)
+{
+ __swab32s(&rr->rr_opcode);
+ __swab32s(&rr->rr_cap);
+ __swab32s(&rr->rr_fsuid);
+ /* rr_fsuid_h is unused */
+ __swab32s(&rr->rr_fsgid);
+ /* rr_fsgid_h is unused */
+ __swab32s(&rr->rr_suppgid1);
+ /* rr_suppgid1_h is unused */
+ __swab32s(&rr->rr_suppgid2);
+ /* rr_suppgid2_h is unused */
+ lustre_swab_lu_fid(&rr->rr_fid1);
+ lustre_swab_lu_fid(&rr->rr_fid2);
+ __swab64s(&rr->rr_mtime);
+ __swab64s(&rr->rr_atime);
+ __swab64s(&rr->rr_ctime);
+ __swab64s(&rr->rr_size);
+ __swab64s(&rr->rr_blocks);
+ __swab32s(&rr->rr_bias);
+ __swab32s(&rr->rr_mode);
+ __swab32s(&rr->rr_flags);
+ __swab32s(&rr->rr_flags_h);
+ __swab32s(&rr->rr_umask);
+
+ CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_rec_reint);
+
+void lustre_swab_lov_desc(struct lov_desc *ld)
+{
+ __swab32s(&ld->ld_tgt_count);
+ __swab32s(&ld->ld_active_tgt_count);
+ __swab32s(&ld->ld_default_stripe_count);
+ __swab32s(&ld->ld_pattern);
+ __swab64s(&ld->ld_default_stripe_size);
+ __swab64s(&ld->ld_default_stripe_offset);
+ __swab32s(&ld->ld_qos_maxage);
+ /* uuid endian insensitive */
+}
+EXPORT_SYMBOL(lustre_swab_lov_desc);
+
+void lustre_swab_lmv_desc(struct lmv_desc *ld)
+{
+ __swab32s(&ld->ld_tgt_count);
+ __swab32s(&ld->ld_active_tgt_count);
+ __swab32s(&ld->ld_default_stripe_count);
+ __swab32s(&ld->ld_pattern);
+ __swab64s(&ld->ld_default_hash_size);
+ __swab32s(&ld->ld_qos_maxage);
+ /* uuid endian insensitive */
+}
+
+void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea)
+{
+ __swab32s(&mea->mea_magic);
+ __swab32s(&mea->mea_count);
+ __swab32s(&mea->mea_master);
+ CLASSERT(offsetof(typeof(*mea), mea_padding) != 0);
+}
+
+void lustre_swab_lmv_user_md(struct lmv_user_md *lum)
+{
+ int i;
+
+ __swab32s(&lum->lum_magic);
+ __swab32s(&lum->lum_stripe_count);
+ __swab32s(&lum->lum_stripe_offset);
+ __swab32s(&lum->lum_hash_type);
+ __swab32s(&lum->lum_type);
+ CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0);
+ CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0);
+ CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0);
+
+ for (i = 0; i < lum->lum_stripe_count; i++) {
+ __swab32s(&lum->lum_objects[i].lum_mds);
+ lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_lmv_user_md);
+
+static void print_lum(struct lov_user_md *lum)
+{
+ CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
+ CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic);
+ CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern);
+ CDEBUG(D_OTHER, "\tlmm_object_id: %llu\n", lmm_oi_id(&lum->lmm_oi));
+ CDEBUG(D_OTHER, "\tlmm_object_seq: %llu\n", lmm_oi_seq(&lum->lmm_oi));
+ CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size);
+ CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count);
+ CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n",
+ lum->lmm_stripe_offset);
+}
+
+static void lustre_swab_lmm_oi(struct ost_id *oi)
+{
+ __swab64s(&oi->oi.oi_id);
+ __swab64s(&oi->oi.oi_seq);
+}
+
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
+{
+ __swab32s(&lum->lmm_magic);
+ __swab32s(&lum->lmm_pattern);
+ lustre_swab_lmm_oi(&lum->lmm_oi);
+ __swab32s(&lum->lmm_stripe_size);
+ __swab16s(&lum->lmm_stripe_count);
+ __swab16s(&lum->lmm_stripe_offset);
+ print_lum(lum);
+}
+
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+ CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+ lustre_swab_lov_user_md_common(lum);
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+ CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+ lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+ /* lmm_pool_name is a char array; nothing to swab */
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
+
+void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
+{
+ CDEBUG(D_IOCTL, "swabbing lov_mds_md\n");
+ __swab32s(&lmm->lmm_magic);
+ __swab32s(&lmm->lmm_pattern);
+ lustre_swab_lmm_oi(&lmm->lmm_oi);
+ __swab32s(&lmm->lmm_stripe_size);
+ __swab16s(&lmm->lmm_stripe_count);
+ __swab16s(&lmm->lmm_layout_gen);
+}
+EXPORT_SYMBOL(lustre_swab_lov_mds_md);
+
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+ int stripe_count)
+{
+ int i;
+
+ for (i = 0; i < stripe_count; i++) {
+ lustre_swab_ost_id(&(lod[i].l_ost_oi));
+ __swab32s(&(lod[i].l_ost_gen));
+ __swab32s(&(lod[i].l_ost_idx));
+ }
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+
+void lustre_swab_ldlm_res_id(struct ldlm_res_id *id)
+{
+ int i;
+
+ for (i = 0; i < RES_NAME_SIZE; i++)
+ __swab64s(&id->name[i]);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
+
+void lustre_swab_ldlm_policy_data(ldlm_wire_policy_data_t *d)
+{
+ /* the lock data is a union and the first two fields are always an
+ * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock
+ * data the same way. */
+ __swab64s(&d->l_extent.start);
+ __swab64s(&d->l_extent.end);
+ __swab64s(&d->l_extent.gid);
+ __swab64s(&d->l_flock.lfw_owner);
+ __swab32s(&d->l_flock.lfw_pid);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
+
+void lustre_swab_ldlm_intent(struct ldlm_intent *i)
+{
+ __swab64s(&i->opc);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_intent);
+
+void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r)
+{
+ __swab32s(&r->lr_type);
+ CLASSERT(offsetof(typeof(*r), lr_padding) != 0);
+ lustre_swab_ldlm_res_id(&r->lr_name);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc);
+
+void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l)
+{
+ lustre_swab_ldlm_resource_desc(&l->l_resource);
+ __swab32s(&l->l_req_mode);
+ __swab32s(&l->l_granted_mode);
+ lustre_swab_ldlm_policy_data(&l->l_policy_data);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
+
+void lustre_swab_ldlm_request(struct ldlm_request *rq)
+{
+ __swab32s(&rq->lock_flags);
+ lustre_swab_ldlm_lock_desc(&rq->lock_desc);
+ __swab32s(&rq->lock_count);
+ /* lock_handle[] opaque */
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_request);
+
+void lustre_swab_ldlm_reply(struct ldlm_reply *r)
+{
+ __swab32s(&r->lock_flags);
+ CLASSERT(offsetof(typeof(*r), lock_padding) != 0);
+ lustre_swab_ldlm_lock_desc(&r->lock_desc);
+ /* lock_handle opaque */
+ __swab64s(&r->lock_policy_res1);
+ __swab64s(&r->lock_policy_res2);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_reply);
+
+void lustre_swab_quota_body(struct quota_body *b)
+{
+ lustre_swab_lu_fid(&b->qb_fid);
+ lustre_swab_lu_fid((struct lu_fid *)&b->qb_id);
+ __swab32s(&b->qb_flags);
+ __swab64s(&b->qb_count);
+ __swab64s(&b->qb_usage);
+ __swab64s(&b->qb_slv_ver);
+}
+
+/* Dump functions */
+void dump_ioo(struct obd_ioobj *ioo)
+{
+ CDEBUG(D_RPCTRACE,
+ "obd_ioobj: ioo_oid=" DOSTID ", ioo_max_brw=%#x, ioo_bufct=%d\n",
+ POSTID(&ioo->ioo_oid), ioo->ioo_max_brw,
+ ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(dump_ioo);
+
+void dump_rniobuf(struct niobuf_remote *nb)
+{
+ CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n",
+ nb->offset, nb->len, nb->flags);
+}
+EXPORT_SYMBOL(dump_rniobuf);
+
+void dump_obdo(struct obdo *oa)
+{
+ __u32 valid = oa->o_valid;
+
+ CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid);
+ if (valid & OBD_MD_FLID)
+ CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi));
+ if (valid & OBD_MD_FLFID)
+ CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n",
+ oa->o_parent_seq);
+ if (valid & OBD_MD_FLSIZE)
+ CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size);
+ if (valid & OBD_MD_FLMTIME)
+ CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime);
+ if (valid & OBD_MD_FLATIME)
+ CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime);
+ if (valid & OBD_MD_FLCTIME)
+ CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime);
+ if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+ CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks);
+ if (valid & OBD_MD_FLGRANT)
+ CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant);
+ if (valid & OBD_MD_FLBLKSZ)
+ CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize);
+ if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE))
+ CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n",
+ oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) |
+ (valid & OBD_MD_FLMODE ? ~S_IFMT : 0)));
+ if (valid & OBD_MD_FLUID)
+ CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid);
+ if (valid & OBD_MD_FLUID)
+ CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h);
+ if (valid & OBD_MD_FLGID)
+ CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid);
+ if (valid & OBD_MD_FLGID)
+ CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h);
+ if (valid & OBD_MD_FLFLAGS)
+ CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags);
+ if (valid & OBD_MD_FLNLINK)
+ CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink);
+ else if (valid & OBD_MD_FLCKSUM)
+ CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n",
+ oa->o_nlink);
+ if (valid & OBD_MD_FLGENER)
+ CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n",
+ oa->o_parent_oid);
+ if (valid & OBD_MD_FLEPOCH)
+ CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n",
+ oa->o_ioepoch);
+ if (valid & OBD_MD_FLFID) {
+ CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n",
+ oa->o_stripe_idx);
+ CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n",
+ oa->o_parent_ver);
+ }
+ if (valid & OBD_MD_FLHANDLE)
+ CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n",
+ oa->o_handle.cookie);
+ if (valid & OBD_MD_FLCOOKIE)
+ CDEBUG(D_RPCTRACE, "obdo: o_lcookie = (llog_cookie dumping not yet implemented)\n");
+}
+EXPORT_SYMBOL(dump_obdo);
+
+void dump_ost_body(struct ost_body *ob)
+{
+ dump_obdo(&ob->oa);
+}
+EXPORT_SYMBOL(dump_ost_body);
+
+void dump_rcs(__u32 *rc)
+{
+ CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc);
+}
+EXPORT_SYMBOL(dump_rcs);
+
+static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_reqmsg);
+
+ switch (req->rq_reqmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+ default:
+ CERROR("bad lustre msg magic: %#08X\n",
+ req->rq_reqmsg->lm_magic);
+ }
+ return 0;
+}
+
+static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_repmsg);
+
+ switch (req->rq_repmsg->lm_magic) {
+ case LUSTRE_MSG_MAGIC_V2:
+ return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
+ default:
+ /* uninitialized yet */
+ return 0;
+ }
+}
+
+void _debug_req(struct ptlrpc_request *req,
+ struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ...)
+{
+ int req_ok = req->rq_reqmsg != NULL;
+ int rep_ok = req->rq_repmsg != NULL;
+ lnet_nid_t nid = LNET_NID_ANY;
+ va_list args;
+
+ if (ptlrpc_req_need_swab(req)) {
+ req_ok = req_ok && req_ptlrpc_body_swabbed(req);
+ rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
+ }
+
+ if (req->rq_import && req->rq_import->imp_connection)
+ nid = req->rq_import->imp_connection->c_peer.nid;
+ else if (req->rq_export && req->rq_export->exp_connection)
+ nid = req->rq_export->exp_connection->c_peer.nid;
+
+ va_start(args, fmt);
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %d dl " CFS_TIME_T " ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d\n",
+ req, req->rq_xid, req->rq_transno,
+ req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0,
+ req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
+ req->rq_import ?
+ req->rq_import->imp_obd->obd_name :
+ req->rq_export ?
+ req->rq_export->exp_client_uuid.uuid :
+ "<?>",
+ libcfs_nid2str(nid),
+ req->rq_request_portal, req->rq_reply_portal,
+ req->rq_reqlen, req->rq_replen,
+ req->rq_early_count, req->rq_timedout,
+ req->rq_deadline,
+ atomic_read(&req->rq_refcount),
+ DEBUG_REQ_FLAGS(req),
+ req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1,
+ rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1,
+ req->rq_status,
+ rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1);
+ va_end(args);
+}
+EXPORT_SYMBOL(_debug_req);
+
+void lustre_swab_lustre_capa(struct lustre_capa *c)
+{
+ lustre_swab_lu_fid(&c->lc_fid);
+ __swab64s(&c->lc_opc);
+ __swab64s(&c->lc_uid);
+ __swab64s(&c->lc_gid);
+ __swab32s(&c->lc_flags);
+ __swab32s(&c->lc_keyid);
+ __swab32s(&c->lc_timeout);
+ __swab32s(&c->lc_expiry);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa);
+
+void lustre_swab_lustre_capa_key(struct lustre_capa_key *k)
+{
+ __swab64s(&k->lk_seq);
+ __swab32s(&k->lk_keyid);
+ CLASSERT(offsetof(typeof(*k), lk_padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa_key);
+
+void lustre_swab_hsm_user_state(struct hsm_user_state *state)
+{
+ __swab32s(&state->hus_states);
+ __swab32s(&state->hus_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_state);
+
+void lustre_swab_hsm_state_set(struct hsm_state_set *hss)
+{
+ __swab32s(&hss->hss_valid);
+ __swab64s(&hss->hss_setmask);
+ __swab64s(&hss->hss_clearmask);
+ __swab32s(&hss->hss_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_state_set);
+
+void lustre_swab_hsm_extent(struct hsm_extent *extent)
+{
+ __swab64s(&extent->offset);
+ __swab64s(&extent->length);
+}
+
+void lustre_swab_hsm_current_action(struct hsm_current_action *action)
+{
+ __swab32s(&action->hca_state);
+ __swab32s(&action->hca_action);
+ lustre_swab_hsm_extent(&action->hca_location);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_current_action);
+
+void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
+{
+ lustre_swab_lu_fid(&hui->hui_fid);
+ lustre_swab_hsm_extent(&hui->hui_extent);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_item);
+
+void lustre_swab_layout_intent(struct layout_intent *li)
+{
+ __swab32s(&li->li_opc);
+ __swab32s(&li->li_flags);
+ __swab64s(&li->li_start);
+ __swab64s(&li->li_end);
+}
+EXPORT_SYMBOL(lustre_swab_layout_intent);
+
+void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
+{
+ lustre_swab_lu_fid(&hpk->hpk_fid);
+ __swab64s(&hpk->hpk_cookie);
+ __swab64s(&hpk->hpk_extent.offset);
+ __swab64s(&hpk->hpk_extent.length);
+ __swab16s(&hpk->hpk_flags);
+ __swab16s(&hpk->hpk_errval);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel);
+
+void lustre_swab_hsm_request(struct hsm_request *hr)
+{
+ __swab32s(&hr->hr_action);
+ __swab32s(&hr->hr_archive_id);
+ __swab64s(&hr->hr_flags);
+ __swab32s(&hr->hr_itemcount);
+ __swab32s(&hr->hr_data_len);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_request);
+
+void lustre_swab_update_buf(struct update_buf *ub)
+{
+ __swab32s(&ub->ub_magic);
+ __swab32s(&ub->ub_count);
+}
+EXPORT_SYMBOL(lustre_swab_update_buf);
+
+void lustre_swab_update_reply_buf(struct update_reply *ur)
+{
+ int i;
+
+ __swab32s(&ur->ur_version);
+ __swab32s(&ur->ur_count);
+ for (i = 0; i < ur->ur_count; i++)
+ __swab32s(&ur->ur_lens[i]);
+}
+EXPORT_SYMBOL(lustre_swab_update_reply_buf);
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl)
+{
+ __swab64s(&msl->msl_flags);
+}
+EXPORT_SYMBOL(lustre_swab_swap_layouts);
+
+void lustre_swab_close_data(struct close_data *cd)
+{
+ lustre_swab_lu_fid(&cd->cd_fid);
+ __swab64s(&cd->cd_data_version);
+}
+EXPORT_SYMBOL(lustre_swab_close_data);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c
new file mode 100644
index 000000000..e1334c24e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pers.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_import.h"
+
+#include "ptlrpc_internal.h"
+
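+/* Describe one LNet MD worth of the bulk: up to LNET_MAX_IOV kiov fragments
+ * of @desc starting at page index mdidx * LNET_MAX_IOV, so that a large bulk
+ * is split across several MDs. */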
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+ int mdidx)
+{
+ CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON);
+
+ LASSERT(mdidx < desc->bd_md_max_brw);
+ LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+ LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV |
+ LNET_MD_PHYS)));
+
+ md->options |= LNET_MD_KIOV;
+ md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV);
+ md->length = min_t(unsigned int, LNET_MAX_IOV, md->length);
+ if (desc->bd_enc_iov)
+ md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV];
+ else
+ md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV];
+}
+
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+ int pageoffset, int len)
+{
+ lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count];
+
+ kiov->kiov_page = page;
+ kiov->kiov_offset = pageoffset;
+ kiov->kiov_len = len;
+
+ desc->bd_iov_count++;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
new file mode 100644
index 000000000..9dbda9332
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
@@ -0,0 +1,678 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "ptlrpc_internal.h"
+
+static int suppress_pings;
+module_param(suppress_pings, int, 0644);
+MODULE_PARM_DESC(suppress_pings, "Suppress pings");
+
+struct mutex pinger_mutex;
+static LIST_HEAD(pinger_imports);
+static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list);
+
+int ptlrpc_pinger_suppress_pings(void)
+{
+ return suppress_pings;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
+
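+/* Allocate and pack an OBD_PING request; it is marked no_resend/no_delay so
+ * a ping is never resent and never sits waiting on a not-yet-ready import. */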
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+ struct ptlrpc_request *req;
+
+ req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+ LUSTRE_OBD_VERSION, OBD_PING);
+ if (req) {
+ ptlrpc_request_set_replen(req);
+ req->rq_no_resend = req->rq_no_delay = 1;
+ }
+ return req;
+}
+
+int ptlrpc_obd_ping(struct obd_device *obd)
+{
+ int rc;
+ struct ptlrpc_request *req;
+
+ req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+ if (req == NULL)
+ return -ENOMEM;
+
+ req->rq_send_state = LUSTRE_IMP_FULL;
+
+ rc = ptlrpc_queue_wait(req);
+
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
+int ptlrpc_ping(struct obd_import *imp)
+{
+ struct ptlrpc_request *req;
+
+ req = ptlrpc_prep_ping(imp);
+ if (req == NULL) {
+ CERROR("OOM trying to ping %s->%s\n",
+ imp->imp_obd->obd_uuid.uuid,
+ obd2cli_tgt(imp->imp_obd));
+ return -ENOMEM;
+ }
+
+ DEBUG_REQ(D_INFO, req, "pinging %s->%s",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+ ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+ return 0;
+}
+
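+/* Schedule the next ping.  While the import is disconnected the interval is
+ * capped at max(CONNECTION_SWITCH_MIN, adaptive-timeout net latency) so that
+ * reconnection is retried promptly. */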
+void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+ int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+ if (imp->imp_state == LUSTRE_IMP_DISCON) {
+ int dtime = max_t(int, CONNECTION_SWITCH_MIN,
+ AT_OFF ? 0 :
+ at_get(&imp->imp_at.iat_net_latency));
+ time = min(time, dtime);
+ }
+ imp->imp_next_ping = cfs_time_shift(time);
+}
+
+void ptlrpc_ping_import_soon(struct obd_import *imp)
+{
+ imp->imp_next_ping = cfs_time_current();
+}
+
+static inline int imp_is_deactive(struct obd_import *imp)
+{
+ return (imp->imp_deactive ||
+ OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE));
+}
+
+static inline int ptlrpc_next_reconnect(struct obd_import *imp)
+{
+ if (imp->imp_server_timeout)
+ return cfs_time_shift(obd_timeout / 2);
+ else
+ return cfs_time_shift(obd_timeout);
+}
+
+long pinger_check_timeout(unsigned long time)
+{
+ struct timeout_item *item;
+ unsigned long timeout = PING_INTERVAL;
+
+ /* The timeout list is sorted in increasing timeout order, so only
+ * the head item needs to be checked. */
+ mutex_lock(&pinger_mutex);
+ list_for_each_entry(item, &timeout_list, ti_chain) {
+ int ti_timeout = item->ti_timeout;
+ if (timeout > ti_timeout)
+ timeout = ti_timeout;
+ break;
+ }
+ mutex_unlock(&pinger_mutex);
+
+ return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)),
+ cfs_time_current());
+}
+
+static bool ir_up;
+
+void ptlrpc_pinger_ir_up(void)
+{
+ CDEBUG(D_HA, "IR up\n");
+ ir_up = true;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_up);
+
+void ptlrpc_pinger_ir_down(void)
+{
+ CDEBUG(D_HA, "IR down\n");
+ ir_up = false;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_down);
+
+static void ptlrpc_pinger_process_import(struct obd_import *imp,
+ unsigned long this_ping)
+{
+ int level;
+ int force;
+ int force_next;
+ int suppress;
+
+ spin_lock(&imp->imp_lock);
+
+ level = imp->imp_state;
+ force = imp->imp_force_verify;
+ force_next = imp->imp_force_next_verify;
+ /*
+ * This will be used below only if the import is "FULL".
+ */
+ suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS);
+
+ imp->imp_force_verify = 0;
+
+ if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+ !force) {
+ spin_unlock(&imp->imp_lock);
+ return;
+ }
+
+ imp->imp_force_next_verify = 0;
+
+ spin_unlock(&imp->imp_lock);
+
+ CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+ ptlrpc_import_state_name(level), level, force, force_next,
+ imp->imp_deactive, imp->imp_pingable, suppress);
+
+ if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+ /* wait for a while before trying recovery again */
+ imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+ if (!imp->imp_no_pinger_recover)
+ ptlrpc_initiate_recovery(imp);
+ } else if (level != LUSTRE_IMP_FULL ||
+ imp->imp_obd->obd_no_recov ||
+ imp_is_deactive(imp)) {
+ CDEBUG(D_HA, "%s->%s: not pinging (in recovery or recovery disabled: %s)\n",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+ ptlrpc_import_state_name(level));
+ if (force) {
+ spin_lock(&imp->imp_lock);
+ imp->imp_force_verify = 1;
+ spin_unlock(&imp->imp_lock);
+ }
+ } else if ((imp->imp_pingable && !suppress) || force_next || force) {
+ ptlrpc_ping(imp);
+ }
+}
+
+static int ptlrpc_pinger_main(void *arg)
+{
+ struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+
+ /* Record that the thread is running */
+ thread_set_flags(thread, SVC_RUNNING);
+ wake_up(&thread->t_ctl_waitq);
+
+ /* And now, loop forever, pinging as needed. */
+ while (1) {
+ unsigned long this_ping = cfs_time_current();
+ struct l_wait_info lwi;
+ long time_to_next_wake;
+ struct timeout_item *item;
+ struct list_head *iter;
+
+ mutex_lock(&pinger_mutex);
+ list_for_each_entry(item, &timeout_list, ti_chain) {
+ item->ti_cb(item, item->ti_cb_data);
+ }
+ list_for_each(iter, &pinger_imports) {
+ struct obd_import *imp =
+ list_entry(iter, struct obd_import,
+ imp_pinger_chain);
+
+ ptlrpc_pinger_process_import(imp, this_ping);
+ /* obd_timeout might have changed */
+ if (imp->imp_pingable && imp->imp_next_ping &&
+ cfs_time_after(imp->imp_next_ping,
+ cfs_time_add(this_ping,
+ cfs_time_seconds(PING_INTERVAL))))
+ ptlrpc_update_next_ping(imp, 0);
+ }
+ mutex_unlock(&pinger_mutex);
+ /* update memory usage info */
+ obd_update_maxusage();
+
+ /* Wait until the next ping time, or until we're stopped. */
+ time_to_next_wake = pinger_check_timeout(this_ping);
+ /* The ping sent by ptlrpc_send_rpc may get sent out say .01 second
+ * after this. ptlrpc_pinger_sending_on_import will then set the
+ * next ping time to next_ping + .01 sec, which means we will SKIP
+ * the next ping at next_ping, and the ping will get sent 2 timeouts
+ * from now! Beware. */
+ CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("
+ CFS_TIME_T")\n", time_to_next_wake,
+ cfs_time_add(this_ping,
+ cfs_time_seconds(PING_INTERVAL)));
+ if (time_to_next_wake > 0) {
+ lwi = LWI_TIMEOUT(max_t(long, time_to_next_wake,
+ cfs_time_seconds(1)),
+ NULL, NULL);
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_stopping(thread) ||
+ thread_is_event(thread),
+ &lwi);
+ if (thread_test_and_clear_flags(thread, SVC_STOPPING)) {
+ break;
+ } else {
+ /* woken after adding import to reset timer */
+ thread_test_and_clear_flags(thread, SVC_EVENT);
+ }
+ }
+ }
+
+ thread_set_flags(thread, SVC_STOPPED);
+ wake_up(&thread->t_ctl_waitq);
+
+ CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid());
+ return 0;
+}
+
+static struct ptlrpc_thread pinger_thread;
+
+int ptlrpc_start_pinger(void)
+{
+ struct l_wait_info lwi = { 0 };
+ int rc;
+
+ if (!thread_is_init(&pinger_thread) &&
+ !thread_is_stopped(&pinger_thread))
+ return -EALREADY;
+
+ init_waitqueue_head(&pinger_thread.t_ctl_waitq);
+
+ strcpy(pinger_thread.t_name, "ll_ping");
+
+ /* kthread_run() already gives the pinger its own kernel-thread context,
+ * so no explicit daemonize step is needed here. */
+ rc = PTR_ERR(kthread_run(ptlrpc_pinger_main, &pinger_thread,
+ "%s", pinger_thread.t_name));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("cannot start thread: %d\n", rc);
+ return rc;
+ }
+ l_wait_event(pinger_thread.t_ctl_waitq,
+ thread_is_running(&pinger_thread), &lwi);
+
+ if (suppress_pings)
+ CWARN("Pings will be suppressed at the request of the administrator. The configuration shall meet the additional requirements described in the manual. (Search for the \"suppress_pings\" kernel module parameter.)\n");
+
+ return 0;
+}
+
+int ptlrpc_pinger_remove_timeouts(void);
+
+int ptlrpc_stop_pinger(void)
+{
+ struct l_wait_info lwi = { 0 };
+ int rc = 0;
+
+ if (thread_is_init(&pinger_thread) ||
+ thread_is_stopped(&pinger_thread))
+ return -EALREADY;
+
+ ptlrpc_pinger_remove_timeouts();
+ thread_set_flags(&pinger_thread, SVC_STOPPING);
+ wake_up(&pinger_thread.t_ctl_waitq);
+
+ l_wait_event(pinger_thread.t_ctl_waitq,
+ thread_is_stopped(&pinger_thread), &lwi);
+
+ return rc;
+}
+
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+{
+ ptlrpc_update_next_ping(imp, 0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import);
+
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+ ptlrpc_update_next_ping(imp, 1);
+ assert_spin_locked(&imp->imp_lock);
+ /*
+ * Avoid reading stale imp_connect_data. When not sure if pings are
+ * expected or not on next connection, we assume they are not and force
+ * one anyway to guarantee the chance of updating
+ * imp_peer_committed_transno.
+ */
+ if (imp->imp_state != LUSTRE_IMP_FULL ||
+ OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
+ imp->imp_force_next_verify = 1;
+}
+
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+ if (!list_empty(&imp->imp_pinger_chain))
+ return -EALREADY;
+
+ mutex_lock(&pinger_mutex);
+ CDEBUG(D_HA, "adding pingable import %s->%s\n",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+ /* if we add to pinger we want recovery on this import */
+ imp->imp_obd->obd_no_recov = 0;
+ ptlrpc_update_next_ping(imp, 0);
+ /* XXX sort, blah blah */
+ list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+ class_import_get(imp);
+
+ ptlrpc_pinger_wake_up();
+ mutex_unlock(&pinger_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_add_import);
+
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+ if (list_empty(&imp->imp_pinger_chain))
+ return -ENOENT;
+
+ mutex_lock(&pinger_mutex);
+ list_del_init(&imp->imp_pinger_chain);
+ CDEBUG(D_HA, "removing pingable import %s->%s\n",
+ imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+ /* if we remove from pinger we don't want recovery on this import */
+ imp->imp_obd->obd_no_recov = 1;
+ class_import_put(imp);
+ mutex_unlock(&pinger_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_del_import);
+
+/**
+ * Register a timeout callback on the pinger list; the callback will be
+ * invoked when the timeout expires.
+ */
+struct timeout_item *ptlrpc_new_timeout(int time, enum timeout_event event,
+ timeout_cb_t cb, void *data)
+{
+ struct timeout_item *ti;
+
+ OBD_ALLOC_PTR(ti);
+ if (!ti)
+ return NULL;
+
+ INIT_LIST_HEAD(&ti->ti_obd_list);
+ INIT_LIST_HEAD(&ti->ti_chain);
+ ti->ti_timeout = time;
+ ti->ti_event = event;
+ ti->ti_cb = cb;
+ ti->ti_cb_data = data;
+
+ return ti;
+}
+
+/**
+ * Register a timeout event on the pinger thread.
+ * Note: the timeout list is kept sorted in order of increasing timeout value.
+ */
+static struct timeout_item*
+ptlrpc_pinger_register_timeout(int time, enum timeout_event event,
+ timeout_cb_t cb, void *data)
+{
+ struct timeout_item *item, *tmp;
+
+ LASSERT(mutex_is_locked(&pinger_mutex));
+
+ list_for_each_entry(item, &timeout_list, ti_chain)
+ if (item->ti_event == event)
+ goto out;
+
+ item = ptlrpc_new_timeout(time, event, cb, data);
+ if (item) {
+ list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) {
+ if (tmp->ti_timeout < time) {
+ list_add(&item->ti_chain, &tmp->ti_chain);
+ goto out;
+ }
+ }
+ list_add(&item->ti_chain, &timeout_list);
+ }
+out:
+ return item;
+}
+
+/* Add a client_obd to the timeout event list; when the timeout (@time)
+ * expires, the callback (@cb) will be called.
+ */
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+ timeout_cb_t cb, void *data,
+ struct list_head *obd_list)
+{
+ struct timeout_item *ti;
+
+ mutex_lock(&pinger_mutex);
+ ti = ptlrpc_pinger_register_timeout(time, event, cb, data);
+ if (!ti) {
+ mutex_unlock(&pinger_mutex);
+ return -EINVAL;
+ }
+ list_add(obd_list, &ti->ti_obd_list);
+ mutex_unlock(&pinger_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_add_timeout_client);
+
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+ enum timeout_event event)
+{
+ struct timeout_item *ti = NULL, *item;
+
+ if (list_empty(obd_list))
+ return 0;
+ mutex_lock(&pinger_mutex);
+ list_del_init(obd_list);
+ /**
+ * If no obd is attached to the timeout event
+ * list, remove this timeout event from the pinger.
+ */
+ list_for_each_entry(item, &timeout_list, ti_chain) {
+ if (item->ti_event == event) {
+ ti = item;
+ break;
+ }
+ }
+ LASSERTF(ti != NULL, "ti is NULL !\n");
+ if (list_empty(&ti->ti_obd_list)) {
+ list_del(&ti->ti_chain);
+ OBD_FREE_PTR(ti);
+ }
+ mutex_unlock(&pinger_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_del_timeout_client);
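+
+/*
+ * Usage sketch (hypothetical "foo_*" names; TIMEOUT_GRANT is used as the
+ * timeout event): a client obd embeds a list_head, registers a callback
+ * with the pinger, and deregisters it on cleanup:
+ *
+ *	INIT_LIST_HEAD(&foo->foo_timeout_chain);
+ *	rc = ptlrpc_add_timeout_client(foo_timeout_secs, TIMEOUT_GRANT,
+ *				       foo_timeout_cb, foo,
+ *				       &foo->foo_timeout_chain);
+ *	...
+ *	ptlrpc_del_timeout_client(&foo->foo_timeout_chain, TIMEOUT_GRANT);
+ *
+ * foo_timeout_cb must match timeout_cb_t and is invoked from the pinger
+ * thread.
+ */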
+
+int ptlrpc_pinger_remove_timeouts(void)
+{
+ struct timeout_item *item, *tmp;
+
+ mutex_lock(&pinger_mutex);
+ list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) {
+ LASSERT(list_empty(&item->ti_obd_list));
+ list_del(&item->ti_chain);
+ OBD_FREE_PTR(item);
+ }
+ mutex_unlock(&pinger_mutex);
+ return 0;
+}
+
+void ptlrpc_pinger_wake_up(void)
+{
+ thread_add_flags(&pinger_thread, SVC_EVENT);
+ wake_up(&pinger_thread.t_ctl_waitq);
+}
+
+/* Ping evictor thread */
+#define PET_READY 1
+#define PET_TERMINATE 2
+
+static int pet_refcount;
+static int pet_state;
+static wait_queue_head_t pet_waitq;
+LIST_HEAD(pet_list);
+static DEFINE_SPINLOCK(pet_lock);
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+ struct obd_device *obd;
+
+ spin_lock(&pet_lock);
+ if (pet_state != PET_READY) {
+ /* eventually the new obd will call here again. */
+ spin_unlock(&pet_lock);
+ return 1;
+ }
+
+ obd = class_exp2obd(exp);
+ if (list_empty(&obd->obd_evict_list)) {
+ class_incref(obd, "evictor", obd);
+ list_add(&obd->obd_evict_list, &pet_list);
+ }
+ spin_unlock(&pet_lock);
+
+ wake_up(&pet_waitq);
+ return 0;
+}
+
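+/*
+ * Main loop of the ping evictor: sleep until an obd is queued on pet_list
+ * (by ping_evictor_wake()) or termination is requested, then walk that
+ * obd's obd_exports_timed list, which is sorted by last request time, and
+ * fail every export that has been silent for more than PING_EVICT_TIMEOUT
+ * seconds, stopping at the first export that is still within the limit.
+ */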
+static int ping_evictor_main(void *arg)
+{
+ struct obd_device *obd;
+ struct obd_export *exp;
+ struct l_wait_info lwi = { 0 };
+ time_t expire_time;
+
+ unshare_fs_struct();
+
+ CDEBUG(D_HA, "Starting Ping Evictor\n");
+ pet_state = PET_READY;
+ while (1) {
+ l_wait_event(pet_waitq, (!list_empty(&pet_list)) ||
+ (pet_state == PET_TERMINATE), &lwi);
+
+ /* loop until all obds have been removed */
+ if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
+ break;
+
+ /* We only get here if pet_list is non-empty, and entries are only
+ * removed from it at the end of this loop, so the lock is not
+ * strictly necessary. */
+ spin_lock(&pet_lock);
+ obd = list_entry(pet_list.next, struct obd_device,
+ obd_evict_list);
+ spin_unlock(&pet_lock);
+
+ expire_time = get_seconds() - PING_EVICT_TIMEOUT;
+
+ CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+ obd->obd_name, expire_time);
+
+ /* Exports can't be deleted out of the list while we hold
+ * the obd lock (class_unlink_export), which means we can't
+ * lose the last ref on the export. If they've already been
+ * removed from the list, we won't find them here. */
+ spin_lock(&obd->obd_dev_lock);
+ while (!list_empty(&obd->obd_exports_timed)) {
+ exp = list_entry(obd->obd_exports_timed.next,
+ struct obd_export,
+ exp_obd_chain_timed);
+ if (expire_time > exp->exp_last_request_time) {
+ class_export_get(exp);
+ spin_unlock(&obd->obd_dev_lock);
+ LCONSOLE_WARN("%s: haven't heard from client %s (at %s) in %ld seconds. I think it's dead, and I am evicting it. exp %p, cur %ld expire %ld last %ld\n",
+ obd->obd_name,
+ obd_uuid2str(&exp->exp_client_uuid),
+ obd_export_nid2str(exp),
+ (long)(get_seconds() -
+ exp->exp_last_request_time),
+ exp, (long)get_seconds(),
+ (long)expire_time,
+ (long)exp->exp_last_request_time);
+ CDEBUG(D_HA, "Last request was at %ld\n",
+ exp->exp_last_request_time);
+ class_fail_export(exp);
+ class_export_put(exp);
+ spin_lock(&obd->obd_dev_lock);
+ } else {
+ /* List is sorted, so everyone below is ok */
+ break;
+ }
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ spin_lock(&pet_lock);
+ list_del_init(&obd->obd_evict_list);
+ spin_unlock(&pet_lock);
+
+ class_decref(obd, "evictor", obd);
+ }
+ CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+ return 0;
+}
+
+void ping_evictor_start(void)
+{
+ struct task_struct *task;
+
+ if (++pet_refcount > 1)
+ return;
+
+ init_waitqueue_head(&pet_waitq);
+
+ task = kthread_run(ping_evictor_main, NULL, "ll_evictor");
+ if (IS_ERR(task)) {
+ pet_refcount--;
+ CERROR("Cannot start ping evictor thread: %ld\n",
+ PTR_ERR(task));
+ }
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+void ping_evictor_stop(void)
+{
+ if (--pet_refcount > 0)
+ return;
+
+ pet_state = PET_TERMINATE;
+ wake_up(&pet_waitq);
+}
+EXPORT_SYMBOL(ping_evictor_stop);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644
index 000000000..a66dc3c6d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
@@ -0,0 +1,312 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+#include "../ldlm/ldlm_internal.h"
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
+extern struct mutex ptlrpc_all_services_mutex;
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait);
+/* ptlrpcd.c */
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc);
+
+/* client.c */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+ unsigned type, unsigned portal);
+int ptlrpc_request_cache_init(void);
+void ptlrpc_request_cache_fini(void);
+struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags);
+void ptlrpc_request_cache_free(struct ptlrpc_request *req);
+void ptlrpc_init_xid(void);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
+void lustre_assert_wire_constants(void);
+int ptlrpc_import_in_recovery(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
+void ptlrpc_handle_failed_import(struct obd_import *imp);
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
+void ptlrpc_initiate_recovery(struct obd_import *imp);
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
+
+#if defined(CONFIG_PROC_FS)
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry,
+ struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount);
+void ptlrpc_lprocfs_do_request_stat(struct ptlrpc_request *req,
+ long q_usec, long work_usec);
+#else
+#define ptlrpc_lprocfs_register_service(params...) do {} while (0)
+#define ptlrpc_lprocfs_unregister_service(params...) do {} while (0)
+#define ptlrpc_lprocfs_rpc_sent(params...) do {} while (0)
+#define ptlrpc_lprocfs_do_request_stat(params...) do {} while (0)
+#endif /* CONFIG_PROC_FS */
+
+/* NRS */
+
+/**
+ * NRS core object.
+ *
+ * Holds NRS core fields.
+ */
+struct nrs_core {
+ /**
+ * Protects nrs_core::nrs_policies, serializes external policy
+ * registration/unregistration, and NRS core lprocfs operations.
+ */
+ struct mutex nrs_mutex;
+ /* XXX: This is just for liblustre. Remove the #if defined directive
+ * when the "cfs_" prefix is dropped from cfs_list_head. */
+ /**
+ * List of all policy descriptors registered with NRS core; protected
+ * by nrs_core::nrs_mutex.
+ */
+ struct list_head nrs_policies;
+
+};
+
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc);
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc);
+
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req, bool hp);
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req, bool hp);
+
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+ bool peek, bool force);
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp,
+ bool force)
+{
+ return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force);
+}
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+ return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+}
+
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp);
+
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+ enum ptlrpc_nrs_queue_type queue, char *name,
+ enum ptlrpc_nrs_ctl opc, bool single, void *arg);
+
+int ptlrpc_nrs_init(void);
+void ptlrpc_nrs_fini(void);
+
+static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt)
+{
+ return svcpt->scp_nrs_hp != NULL;
+}
+
+static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc)
+{
+ /**
+ * If the first service partition has an HP NRS head, all service
+ * partitions will.
+ */
+ return nrs_svcpt_has_hp(svc->srv_parts[0]);
+}
+
+static inline
+struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp)
+{
+ LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt)));
+ return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg;
+}
+
+static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy)
+{
+ return policy->pol_nrs->nrs_svcpt->scp_cpt;
+}
+
+static inline
+struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy)
+{
+ return policy->pol_nrs->nrs_svcpt->scp_service;
+}
+
+static inline
+struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy)
+{
+ return policy->pol_nrs->nrs_svcpt;
+}
+
+static inline
+struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy)
+{
+ return nrs_pol2svc(policy)->srv_cptable;
+}
+
+static inline struct ptlrpc_nrs_resource *
+nrs_request_resource(struct ptlrpc_nrs_request *nrq)
+{
+ LASSERT(nrq->nr_initialized);
+ LASSERT(!nrq->nr_finalized);
+
+ return nrq->nr_res_ptrs[nrq->nr_res_idx];
+}
+
+static inline
+struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq)
+{
+ return nrs_request_resource(nrq)->res_policy;
+}
+
+#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:"
+#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:"
+
+/**
+ * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum.
+ */
+#define LPROCFS_NRS_QUANTUM_MAX 65535
+
+/**
+ * Max valid command string is the size of the labels, plus "65535" twice, plus
+ * a separating space character.
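+ * For example: "reg_quantum:65535 hp_quantum:65535".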
+ */
+#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \
+ sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \
+ NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX))
+
+/* recovd_thread.c */
+
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);
+
+/* pers.c */
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+ int mdcnt);
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+ int pageoffset, int len);
+
+/* pack_generic.c */
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt);
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs);
+
+/* pinger.c */
+int ptlrpc_start_pinger(void);
+int ptlrpc_stop_pinger(void);
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
+void ptlrpc_pinger_commit_expected(struct obd_import *imp);
+void ptlrpc_pinger_wake_up(void);
+void ptlrpc_ping_import_soon(struct obd_import *imp);
+int ping_evictor_wake(struct obd_export *exp);
+
+/* sec_null.c */
+int sptlrpc_null_init(void);
+void sptlrpc_null_fini(void);
+
+/* sec_plain.c */
+int sptlrpc_plain_init(void);
+void sptlrpc_plain_fini(void);
+
+/* sec_bulk.c */
+int sptlrpc_enc_pool_init(void);
+void sptlrpc_enc_pool_fini(void);
+int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v);
+
+/* sec_lproc.c */
+#if defined(CONFIG_PROC_FS)
+int sptlrpc_lproc_init(void);
+void sptlrpc_lproc_fini(void);
+#else
+static inline int sptlrpc_lproc_init(void)
+{ return 0; }
+static inline void sptlrpc_lproc_fini(void) {}
+#endif
+
+/* sec_gc.c */
+int sptlrpc_gc_init(void);
+void sptlrpc_gc_fini(void);
+
+/* sec_config.c */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+ enum lustre_sec_part to,
+ struct obd_uuid *target,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *sf);
+int sptlrpc_conf_init(void);
+void sptlrpc_conf_fini(void);
+
+/* sec.c */
+int sptlrpc_init(void);
+void sptlrpc_fini(void);
+
+static inline int ll_rpc_recoverable_error(int rc)
+{
+ return (rc == -ENOTCONN || rc == -ENODEV);
+}
+
+static inline int tgt_mod_init(void)
+{
+ return 0;
+}
+
+static inline void tgt_mod_exit(void)
+{
+ return;
+}
+
+static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set)
+{
+ if (atomic_dec_and_test(&set->set_refcount))
+ OBD_FREE_PTR(set);
+}
+#endif /* PTLRPC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644
index 000000000..5268887ca
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
@@ -0,0 +1,171 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_req_layout.h"
+
+#include "ptlrpc_internal.h"
+
+extern spinlock_t ptlrpc_last_xid_lock;
+#if RS_DEBUG
+extern spinlock_t ptlrpc_rs_debug_lock;
+#endif
+extern struct mutex pinger_mutex;
+extern struct mutex ptlrpcd_mutex;
+
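+/*
+ * Module init: each successfully completed setup step bumps cleanup_phase,
+ * so that on failure the error path can unwind only the steps that actually
+ * ran, in reverse order, by falling through the switch cases below.
+ */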
+__init int ptlrpc_init(void)
+{
+ int rc, cleanup_phase = 0;
+
+ lustre_assert_wire_constants();
+#if RS_DEBUG
+ spin_lock_init(&ptlrpc_rs_debug_lock);
+#endif
+ mutex_init(&ptlrpc_all_services_mutex);
+ mutex_init(&pinger_mutex);
+ mutex_init(&ptlrpcd_mutex);
+ ptlrpc_init_xid();
+
+ rc = req_layout_init();
+ if (rc)
+ return rc;
+
+ rc = ptlrpc_hr_init();
+ if (rc)
+ return rc;
+
+ cleanup_phase = 1;
+ rc = ptlrpc_request_cache_init();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 2;
+ rc = ptlrpc_init_portals();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 3;
+
+ rc = ptlrpc_connection_init();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 4;
+ ptlrpc_put_connection_superhack = ptlrpc_connection_put;
+
+ rc = ptlrpc_start_pinger();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 5;
+ rc = ldlm_init();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 6;
+ rc = sptlrpc_init();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 7;
+ rc = ptlrpc_nrs_init();
+ if (rc)
+ goto cleanup;
+
+ cleanup_phase = 8;
+ rc = tgt_mod_init();
+ if (rc)
+ goto cleanup;
+ return 0;
+
+cleanup:
+ switch (cleanup_phase) {
+ case 8:
+ ptlrpc_nrs_fini();
+ /* Fall through */
+ case 7:
+ sptlrpc_fini();
+ /* Fall through */
+ case 6:
+ ldlm_exit();
+ /* Fall through */
+ case 5:
+ ptlrpc_stop_pinger();
+ /* Fall through */
+ case 4:
+ ptlrpc_connection_fini();
+ /* Fall through */
+ case 3:
+ ptlrpc_exit_portals();
+ /* Fall through */
+ case 2:
+ ptlrpc_request_cache_fini();
+ /* Fall through */
+ case 1:
+ ptlrpc_hr_fini();
+ req_layout_fini();
+ /* Fall through */
+ default: ;
+ }
+
+ return rc;
+}
+
+static void __exit ptlrpc_exit(void)
+{
+ tgt_mod_exit();
+ ptlrpc_nrs_fini();
+ sptlrpc_fini();
+ ldlm_exit();
+ ptlrpc_stop_pinger();
+ ptlrpc_exit_portals();
+ ptlrpc_request_cache_fini();
+ ptlrpc_hr_fini();
+ ptlrpc_connection_fini();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Request Processor and Lock Management");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0.0");
+
+module_init(ptlrpc_init);
+module_exit(ptlrpc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
new file mode 100644
index 000000000..0c178ec0e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
@@ -0,0 +1,811 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/ptlrpcd.c
+ */
+
+/** \defgroup ptlrpcd PortalRPC daemon
+ *
+ * ptlrpcd is a special thread with its own request set, to which other users
+ * may add requests when they do not want to wait for their completion.
+ * PtlRPCD will take care of sending such requests and then processing their
+ * replies and calling completion callbacks as necessary.
+ * The callbacks are called directly from ptlrpcd context.
+ * It is important never to block significantly (especially on RPCs!) within
+ * such a completion handler, or a deadlock might occur where ptlrpcd enters a
+ * callback that attempts to send another RPC and waits for it to return,
+ * during which time ptlrpcd is completely blocked; so e.g. if an import
+ * fails, recovery cannot progress because connection requests are also
+ * sent by ptlrpcd.
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/lustre_net.h"
+#include "../include/lustre_lib.h"
+#include "../include/lustre_ha.h"
+#include "../include/obd_class.h" /* for obd_zombie */
+#include "../include/obd_support.h" /* for OBD_FAIL_CHECK */
+#include "../include/cl_object.h" /* cl_env_{get,put}() */
+#include "../include/lprocfs_status.h"
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpcd {
+ int pd_size;
+ int pd_index;
+ int pd_nthreads;
+ struct ptlrpcd_ctl pd_thread_rcv;
+ struct ptlrpcd_ctl pd_threads[0];
+};
+
+static int max_ptlrpcds;
+module_param(max_ptlrpcds, int, 0644);
+MODULE_PARM_DESC(max_ptlrpcds, "Max ptlrpcd thread count to be started.");
+
+static int ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+module_param(ptlrpcd_bind_policy, int, 0644);
+MODULE_PARM_DESC(ptlrpcd_bind_policy, "Ptlrpcd threads binding mode.");
+static struct ptlrpcd *ptlrpcds;
+
+struct mutex ptlrpcd_mutex;
+static int ptlrpcd_users;
+
+void ptlrpcd_wake(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request_set *rq_set = req->rq_set;
+
+ LASSERT(rq_set != NULL);
+
+ wake_up(&rq_set->set_waitq);
+}
+EXPORT_SYMBOL(ptlrpcd_wake);
+
+static struct ptlrpcd_ctl *
+ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index)
+{
+ int idx = 0;
+
+ if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL)
+ return &ptlrpcds->pd_thread_rcv;
+
+ switch (policy) {
+ case PDL_POLICY_SAME:
+ idx = smp_processor_id() % ptlrpcds->pd_nthreads;
+ break;
+ case PDL_POLICY_LOCAL:
+ /* Until the CPU partition patches are available, process it the same
+ * as "PDL_POLICY_ROUND". */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix this code to use new CPU partition APIs"
+# endif
+ /* Fall through to PDL_POLICY_ROUND until the CPU
+ * partition patches are available. */
+ index = -1;
+ case PDL_POLICY_PREFERRED:
+ if (index >= 0 && index < num_online_cpus()) {
+ idx = index % ptlrpcds->pd_nthreads;
+ break;
+ }
+ /* Fall through to PDL_POLICY_ROUND for bad index. */
+ default:
+ /* Fall through to PDL_POLICY_ROUND for unknown policy. */
+ case PDL_POLICY_ROUND:
+ /* We do not care whether it is strict load balance. */
+ idx = ptlrpcds->pd_index + 1;
+ if (idx == smp_processor_id())
+ idx++;
+ idx %= ptlrpcds->pd_nthreads;
+ ptlrpcds->pd_index = idx;
+ break;
+ }
+
+ return &ptlrpcds->pd_threads[idx];
+}
+
+/**
+ * Move all requests from an existing request set to the ptlrpcd queue.
+ * All requests from the set must be in phase RQ_PHASE_NEW.
+ */
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp, *pos;
+ struct ptlrpcd_ctl *pc;
+ struct ptlrpc_request_set *new;
+ int count, i;
+
+ pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1);
+ new = pc->pc_set;
+
+ list_for_each_safe(pos, tmp, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(pos, struct ptlrpc_request,
+ rq_set_chain);
+
+ LASSERT(req->rq_phase == RQ_PHASE_NEW);
+ req->rq_set = new;
+ req->rq_queued_time = cfs_time_current();
+ }
+
+ spin_lock(&new->set_new_req_lock);
+ list_splice_init(&set->set_requests, &new->set_new_requests);
+ i = atomic_read(&set->set_remaining);
+ count = atomic_add_return(i, &new->set_new_count);
+ atomic_set(&set->set_remaining, 0);
+ spin_unlock(&new->set_new_req_lock);
+ if (count == i) {
+ wake_up(&new->set_waitq);
+
+ /* XXX: It may be unnecessary to wake up all the partners. But to
+ * guarantee that async RPCs are processed ASAP, we have
+ * no better choice. This may be fixed in the future. */
+ for (i = 0; i < pc->pc_npartners; i++)
+ wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+ }
+}
+EXPORT_SYMBOL(ptlrpcd_add_rqset);
+
+/**
+ * Return the number of transferred RPCs.
+ */
+static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des,
+ struct ptlrpc_request_set *src)
+{
+ struct list_head *tmp, *pos;
+ struct ptlrpc_request *req;
+ int rc = 0;
+
+ spin_lock(&src->set_new_req_lock);
+ if (likely(!list_empty(&src->set_new_requests))) {
+ list_for_each_safe(pos, tmp, &src->set_new_requests) {
+ req = list_entry(pos, struct ptlrpc_request,
+ rq_set_chain);
+ req->rq_set = des;
+ }
+ list_splice_init(&src->set_new_requests,
+ &des->set_requests);
+ rc = atomic_read(&src->set_new_count);
+ atomic_add(rc, &des->set_remaining);
+ atomic_set(&src->set_new_count, 0);
+ }
+ spin_unlock(&src->set_new_req_lock);
+ return rc;
+}
+
+/**
+ * Requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set().
+ */
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx)
+{
+ struct ptlrpcd_ctl *pc;
+
+ if (req->rq_reqmsg)
+ lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+ spin_lock(&req->rq_lock);
+ if (req->rq_invalid_rqset) {
+ struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5),
+ back_to_sleep, NULL);
+
+ req->rq_invalid_rqset = 0;
+ spin_unlock(&req->rq_lock);
+ l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi);
+ } else if (req->rq_set) {
+ /* If we have a valid "rq_set", just reuse it to avoid double
+ * linking. */
+ LASSERT(req->rq_phase == RQ_PHASE_NEW);
+ LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY);
+
+ /* ptlrpc_check_set will decrease the count */
+ atomic_inc(&req->rq_set->set_remaining);
+ spin_unlock(&req->rq_lock);
+ wake_up(&req->rq_set->set_waitq);
+ return;
+ } else {
+ spin_unlock(&req->rq_lock);
+ }
+
+ pc = ptlrpcd_select_pc(req, policy, idx);
+
+ DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]",
+ req, pc->pc_name, pc->pc_index);
+
+ ptlrpc_set_add_new_req(pc, req);
+}
+EXPORT_SYMBOL(ptlrpcd_add_req);
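+
+/*
+ * Usage sketch: a caller that does not want to block on an RPC sets a reply
+ * interpreter and hands the request to ptlrpcd instead of waiting on its
+ * own set; "foo_interpret" below is a hypothetical ptlrpc_interpterer_t:
+ *
+ *	req->rq_interpret_reply = foo_interpret;
+ *	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ *
+ * foo_interpret then runs from ptlrpcd context once the reply has been
+ * processed, and must not block on further synchronous RPCs (see the
+ * comment at the top of this file).
+ */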
+
+static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set)
+{
+ atomic_inc(&set->set_refcount);
+}
+
+/**
+ * Check if there is more work to do on ptlrpcd set.
+ * Returns 1 if yes.
+ */
+static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc)
+{
+ struct list_head *tmp, *pos;
+ struct ptlrpc_request *req;
+ struct ptlrpc_request_set *set = pc->pc_set;
+ int rc = 0;
+ int rc2;
+
+ if (atomic_read(&set->set_new_count)) {
+ spin_lock(&set->set_new_req_lock);
+ if (likely(!list_empty(&set->set_new_requests))) {
+ list_splice_init(&set->set_new_requests,
+ &set->set_requests);
+ atomic_add(atomic_read(&set->set_new_count),
+ &set->set_remaining);
+ atomic_set(&set->set_new_count, 0);
+ /*
+ * Need to calculate its timeout.
+ */
+ rc = 1;
+ }
+ spin_unlock(&set->set_new_req_lock);
+ }
+
+ /* We should call lu_env_refill() before handling new requests to make
+ * sure that the env keys the requests depend on really exist.
+ */
+ rc2 = lu_env_refill(env);
+ if (rc2 != 0) {
+ /*
+ * XXX This is very awkward situation, because
+ * execution can neither continue (request
+ * interpreters assume that env is set up), nor repeat
+ * the loop (as this potentially results in a tight
+ * loop of -ENOMEM's).
+ *
+ * Fortunately, refill only ever does something when
+ * new modules are loaded, i.e., early during boot up.
+ */
+ CERROR("Failure to refill session: %d\n", rc2);
+ return rc;
+ }
+
+ if (atomic_read(&set->set_remaining))
+ rc |= ptlrpc_check_set(env, set);
+
+ /* NB: ptlrpc_check_set has already moved completed requests to the
+ * head of set::set_requests */
+ list_for_each_safe(pos, tmp, &set->set_requests) {
+ req = list_entry(pos, struct ptlrpc_request, rq_set_chain);
+ if (req->rq_phase != RQ_PHASE_COMPLETE)
+ break;
+
+ list_del_init(&req->rq_set_chain);
+ req->rq_set = NULL;
+ ptlrpc_req_finished(req);
+ }
+
+ if (rc == 0) {
+ /*
+ * If new requests have been added, make sure to wake up.
+ */
+ rc = atomic_read(&set->set_new_count);
+
+ /* If we have nothing to do, check whether we can take some
+ * work from our partner threads. */
+ if (rc == 0 && pc->pc_npartners > 0) {
+ struct ptlrpcd_ctl *partner;
+ struct ptlrpc_request_set *ps;
+ int first = pc->pc_cursor;
+
+ do {
+ partner = pc->pc_partners[pc->pc_cursor++];
+ if (pc->pc_cursor >= pc->pc_npartners)
+ pc->pc_cursor = 0;
+ if (partner == NULL)
+ continue;
+
+ spin_lock(&partner->pc_lock);
+ ps = partner->pc_set;
+ if (ps == NULL) {
+ spin_unlock(&partner->pc_lock);
+ continue;
+ }
+
+ ptlrpc_reqset_get(ps);
+ spin_unlock(&partner->pc_lock);
+
+ if (atomic_read(&ps->set_new_count)) {
+ rc = ptlrpcd_steal_rqset(set, ps);
+ if (rc > 0)
+ CDEBUG(D_RPCTRACE, "transfer %d async RPCs [%d->%d]\n",
+ rc, partner->pc_index,
+ pc->pc_index);
+ }
+ ptlrpc_reqset_put(ps);
+ } while (rc == 0 && pc->pc_cursor != first);
+ }
+ }
+
+ return rc;
+}
+
+/**
+ * Main ptlrpcd thread.
+ * ptlrpc's code paths like to execute in process context, so we have this
+ * thread which spins on a set which contains the rpcs and sends them.
+ *
+ */
+static int ptlrpcd(void *arg)
+{
+ struct ptlrpcd_ctl *pc = arg;
+ struct ptlrpc_request_set *set = pc->pc_set;
+ struct lu_env env = { .le_ses = NULL };
+ int rc, exit = 0;
+
+ unshare_fs_struct();
+#if defined(CONFIG_SMP)
+ if (test_bit(LIOD_BIND, &pc->pc_flags)) {
+ int index = pc->pc_index;
+
+ if (index >= 0 && index < num_possible_cpus()) {
+ while (!cpu_online(index)) {
+ if (++index >= num_possible_cpus())
+ index = 0;
+ }
+ set_cpus_allowed_ptr(current,
+ cpumask_of_node(cpu_to_node(index)));
+ }
+ }
+#endif
+ /*
+ * XXX So far only "client" ptlrpcd uses an environment. In
+ * the future, the ptlrpcd thread (or a thread-set) has to be given
+ * an argument, describing its "scope".
+ */
+ rc = lu_context_init(&env.le_ctx,
+ LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF);
+ complete(&pc->pc_starting);
+
+ if (rc != 0)
+ return rc;
+
+ /*
+ * This mainloop strongly resembles ptlrpc_set_wait() except that our
+ * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when
+ * there are requests in the set. New requests come in on the set's
+ * new_req_list and ptlrpcd_check() moves them into the set.
+ */
+ do {
+ struct l_wait_info lwi;
+ int timeout;
+
+ timeout = ptlrpc_set_next_timeout(set);
+ lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+ ptlrpc_expired_set, set);
+
+ lu_context_enter(&env.le_ctx);
+ l_wait_event(set->set_waitq,
+ ptlrpcd_check(&env, pc), &lwi);
+ lu_context_exit(&env.le_ctx);
+
+ /*
+ * Abort inflight rpcs for forced stop case.
+ */
+ if (test_bit(LIOD_STOP, &pc->pc_flags)) {
+ if (test_bit(LIOD_FORCE, &pc->pc_flags))
+ ptlrpc_abort_set(set);
+ exit++;
+ }
+
+ /*
+ * Let's make one more loop to make sure that ptlrpcd_check()
+ * copied all raced new rpcs into the set so we can kill them.
+ */
+ } while (exit < 2);
+
+ /*
+ * Wait for inflight requests to drain.
+ */
+ if (!list_empty(&set->set_requests))
+ ptlrpc_set_wait(set);
+ lu_context_fini(&env.le_ctx);
+
+ complete(&pc->pc_finishing);
+
+ return 0;
+}
+
+/* XXX: We want multiple CPU cores to share the async RPC load. So we start
+ * many ptlrpcd threads. We also want to reduce the ptlrpcd overhead caused
+ * by data transfer across CPU cores. So we bind each ptlrpcd thread to a
+ * specific CPU core. But binding all ptlrpcd threads may cause response
+ * delays when some CPU core(s) are busy with other loads.
+ *
+ * For example, during "ls -l" some async RPCs for statahead are assigned to
+ * ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite busy
+ * with non-ptlrpcd work, such as "ls -l" itself (we want the "ls -l"
+ * thread, the statahead thread, and the ptlrpcd thread to run in parallel).
+ * In such a case the statahead async RPCs cannot be processed in time,
+ * which is undesirable. If ptlrpcd_0 could be rescheduled on another CPU
+ * core it might do better, but that breaks the data transfer policy above.
+ *
+ * So we should not blindly avoid cross-CPU data transfer. We make a
+ * compromise: divide the ptlrpcd thread pool into two parts. One part is
+ * bound mode: each ptlrpcd thread in this part is bound to some CPU core.
+ * The other part is free mode: the ptlrpcd threads in this part can be
+ * scheduled on any CPU core. We specify a partnership between bound mode
+ * ptlrpcd thread(s) and free mode ptlrpcd thread(s), and the async RPC
+ * load is shared among the partners.
+ *
+ * This partly avoids cross-CPU data transfer (if the bound mode ptlrpcd
+ * thread can be scheduled in time), and tries to guarantee that async RPCs
+ * are processed ASAP (as long as the free mode ptlrpcd thread can be
+ * scheduled on some CPU core).
+ *
+ * As for how to specify the partnership between bound mode ptlrpcd
+ * thread(s) and free mode ptlrpcd thread(s), the simplest way is to use
+ * <free, bound> pairs. In the future we can specify more complex
+ * partnerships based on the CPU partition patches, but until such patches
+ * are available we prefer the simplest form.
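+ *
+ * For example, with four ptlrpcd threads under PDB_POLICY_PAIR the pairs
+ * are <ptlrpcd_0 (free), ptlrpcd_1 (bound)> and <ptlrpcd_2 (free),
+ * ptlrpcd_3 (bound)>: each odd-indexed thread is bound and partnered with
+ * the even-indexed thread just before it.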
+ */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix ptlrpcd_bind() to use new CPU partition APIs"
+# endif
+static int ptlrpcd_bind(int index, int max)
+{
+ struct ptlrpcd_ctl *pc;
+ int rc = 0;
+#if defined(CONFIG_NUMA)
+ cpumask_t mask;
+#endif
+
+ LASSERT(index <= max - 1);
+ pc = &ptlrpcds->pd_threads[index];
+ switch (ptlrpcd_bind_policy) {
+ case PDB_POLICY_NONE:
+ pc->pc_npartners = -1;
+ break;
+ case PDB_POLICY_FULL:
+ pc->pc_npartners = 0;
+ set_bit(LIOD_BIND, &pc->pc_flags);
+ break;
+ case PDB_POLICY_PAIR:
+ LASSERT(max % 2 == 0);
+ pc->pc_npartners = 1;
+ break;
+ case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+ {
+ int i;
+ cpumask_copy(&mask, cpumask_of_node(cpu_to_node(index)));
+ for (i = max; i < num_online_cpus(); i++)
+ cpumask_clear_cpu(i, &mask);
+ pc->pc_npartners = cpumask_weight(&mask) - 1;
+ set_bit(LIOD_BIND, &pc->pc_flags);
+ }
+#else
+ LASSERT(max >= 3);
+ pc->pc_npartners = 2;
+#endif
+ break;
+ default:
+ CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy);
+ rc = -EINVAL;
+ }
+
+ if (rc == 0 && pc->pc_npartners > 0) {
+ OBD_ALLOC(pc->pc_partners,
+ sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+ if (pc->pc_partners == NULL) {
+ pc->pc_npartners = 0;
+ rc = -ENOMEM;
+ } else {
+ switch (ptlrpcd_bind_policy) {
+ case PDB_POLICY_PAIR:
+ if (index & 0x1) {
+ set_bit(LIOD_BIND, &pc->pc_flags);
+ pc->pc_partners[0] = &ptlrpcds->
+ pd_threads[index - 1];
+ ptlrpcds->pd_threads[index - 1].
+ pc_partners[0] = pc;
+ }
+ break;
+ case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+ {
+ struct ptlrpcd_ctl *ppc;
+ int i, pidx;
+ /* Partners are cores in the same NUMA node.
+ * Set up partnerships only with ptlrpcd threads
+ * that are already initialized.
+ */
+ for (pidx = 0, i = 0; i < index; i++) {
+ if (cpumask_test_cpu(i, &mask)) {
+ ppc = &ptlrpcds->pd_threads[i];
+ pc->pc_partners[pidx++] = ppc;
+ ppc->pc_partners[ppc->
+ pc_npartners++] = pc;
+ }
+ }
+ /* adjust the number of partners to the number
+ * of partnerships actually set up */
+ pc->pc_npartners = pidx;
+ }
+#else
+ if (index & 0x1)
+ set_bit(LIOD_BIND, &pc->pc_flags);
+ if (index > 0) {
+ pc->pc_partners[0] = &ptlrpcds->
+ pd_threads[index - 1];
+ ptlrpcds->pd_threads[index - 1].
+ pc_partners[1] = pc;
+ if (index == max - 1) {
+ pc->pc_partners[1] =
+ &ptlrpcds->pd_threads[0];
+ ptlrpcds->pd_threads[0].
+ pc_partners[0] = pc;
+ }
+ }
+#endif
+ break;
+ }
+ }
+ }
+
+ return rc;
+}
+
+
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc)
+{
+ int rc;
+
+ /*
+ * Do not allow starting a second thread for the same pc.
+ */
+ if (test_and_set_bit(LIOD_START, &pc->pc_flags)) {
+ CWARN("Starting second thread (%s) for same pc %p\n",
+ name, pc);
+ return 0;
+ }
+
+ pc->pc_index = index;
+ init_completion(&pc->pc_starting);
+ init_completion(&pc->pc_finishing);
+ spin_lock_init(&pc->pc_lock);
+ strlcpy(pc->pc_name, name, sizeof(pc->pc_name));
+ pc->pc_set = ptlrpc_prep_set();
+ if (pc->pc_set == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * So far only "client" ptlrpcd uses an environment. In the future,
+ * ptlrpcd thread (or a thread-set) has to be given an argument,
+ * describing its "scope".
+ */
+ rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER);
+ if (rc != 0)
+ goto out_set;
+
+ {
+ struct task_struct *task;
+ if (index >= 0) {
+ rc = ptlrpcd_bind(index, max);
+ if (rc < 0)
+ goto out_env;
+ }
+
+ task = kthread_run(ptlrpcd, pc, "%s", pc->pc_name);
+ if (IS_ERR(task)) {
+ rc = PTR_ERR(task);
+ goto out_env;
+ }
+
+ wait_for_completion(&pc->pc_starting);
+ }
+ return 0;
+
+out_env:
+ lu_context_fini(&pc->pc_env.le_ctx);
+
+out_set:
+ if (pc->pc_set != NULL) {
+ struct ptlrpc_request_set *set = pc->pc_set;
+
+ spin_lock(&pc->pc_lock);
+ pc->pc_set = NULL;
+ spin_unlock(&pc->pc_lock);
+ ptlrpc_set_destroy(set);
+ }
+ clear_bit(LIOD_BIND, &pc->pc_flags);
+
+out:
+ clear_bit(LIOD_START, &pc->pc_flags);
+ return rc;
+}
+
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
+{
+ if (!test_bit(LIOD_START, &pc->pc_flags)) {
+ CWARN("Thread for pc %p was not started\n", pc);
+ return;
+ }
+
+ set_bit(LIOD_STOP, &pc->pc_flags);
+ if (force)
+ set_bit(LIOD_FORCE, &pc->pc_flags);
+ wake_up(&pc->pc_set->set_waitq);
+}
+
+void ptlrpcd_free(struct ptlrpcd_ctl *pc)
+{
+ struct ptlrpc_request_set *set = pc->pc_set;
+
+ if (!test_bit(LIOD_START, &pc->pc_flags)) {
+ CWARN("Thread for pc %p was not started\n", pc);
+ goto out;
+ }
+
+ wait_for_completion(&pc->pc_finishing);
+ lu_context_fini(&pc->pc_env.le_ctx);
+
+ spin_lock(&pc->pc_lock);
+ pc->pc_set = NULL;
+ spin_unlock(&pc->pc_lock);
+ ptlrpc_set_destroy(set);
+
+ clear_bit(LIOD_START, &pc->pc_flags);
+ clear_bit(LIOD_STOP, &pc->pc_flags);
+ clear_bit(LIOD_FORCE, &pc->pc_flags);
+ clear_bit(LIOD_BIND, &pc->pc_flags);
+
+out:
+ if (pc->pc_npartners > 0) {
+ LASSERT(pc->pc_partners != NULL);
+
+ OBD_FREE(pc->pc_partners,
+ sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+ pc->pc_partners = NULL;
+ }
+ pc->pc_npartners = 0;
+}
+
+static void ptlrpcd_fini(void)
+{
+ int i;
+
+ if (ptlrpcds != NULL) {
+ for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+ ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0);
+ for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+ ptlrpcd_free(&ptlrpcds->pd_threads[i]);
+ ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+ ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+ OBD_FREE(ptlrpcds, ptlrpcds->pd_size);
+ ptlrpcds = NULL;
+ }
+}
+
+static int ptlrpcd_init(void)
+{
+ int nthreads = num_online_cpus();
+ char name[16];
+ int size, i = -1, j, rc = 0;
+
+ if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads)
+ nthreads = max_ptlrpcds;
+ if (nthreads < 2)
+ nthreads = 2;
+ if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR)
+ ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+ else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR)
+ nthreads &= ~1; /* make sure it is even */
+
+ size = offsetof(struct ptlrpcd, pd_threads[nthreads]);
+ OBD_ALLOC(ptlrpcds, size);
+ if (ptlrpcds == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ snprintf(name, sizeof(name), "ptlrpcd_rcv");
+ set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags);
+ rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv);
+ if (rc < 0)
+ goto out;
+
+ /* XXX: We start nthreads ptlrpc daemons. Each of them can process any
+ * non-recovery async RPC to improve overall async RPC efficiency.
+ *
+ * But in some cases there are issues when async I/O RPCs and async
+ * non-I/O RPCs are processed in the same set: the ptlrpcd may be
+ * blocked by some async I/O RPC(s), which then prevents other async
+ * non-I/O RPC(s) from being processed in time.
+ *
+ * Maybe we should distinguish blocking async RPCs from non-blocking
+ * ones and process them in different ptlrpcd sets to avoid the
+ * unnecessary dependency. But how to distribute the async RPC load
+ * among all the ptlrpc daemons then becomes another problem. */
+ for (i = 0; i < nthreads; i++) {
+ snprintf(name, sizeof(name), "ptlrpcd_%d", i);
+ rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]);
+ if (rc < 0)
+ goto out;
+ }
+
+ ptlrpcds->pd_size = size;
+ ptlrpcds->pd_index = 0;
+ ptlrpcds->pd_nthreads = nthreads;
+
+out:
+ if (rc != 0 && ptlrpcds != NULL) {
+ for (j = 0; j <= i; j++)
+ ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0);
+ for (j = 0; j <= i; j++)
+ ptlrpcd_free(&ptlrpcds->pd_threads[j]);
+ ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+ ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+ OBD_FREE(ptlrpcds, size);
+ ptlrpcds = NULL;
+ }
+
+ return rc;
+}
+
+int ptlrpcd_addref(void)
+{
+ int rc = 0;
+
+ mutex_lock(&ptlrpcd_mutex);
+ if (++ptlrpcd_users == 1)
+ rc = ptlrpcd_init();
+ mutex_unlock(&ptlrpcd_mutex);
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpcd_addref);
+
+void ptlrpcd_decref(void)
+{
+ mutex_lock(&ptlrpcd_mutex);
+ if (--ptlrpcd_users == 0)
+ ptlrpcd_fini();
+ mutex_unlock(&ptlrpcd_mutex);
+}
+EXPORT_SYMBOL(ptlrpcd_decref);
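+
+/*
+ * Usage sketch: a subsystem that queues asynchronous RPCs holds a reference
+ * on the ptlrpcd pool for its whole lifetime, typically:
+ *
+ *	rc = ptlrpcd_addref();		(at setup; starts the threads on
+ *	if (rc)				 first use)
+ *		return rc;
+ *	...
+ *	ptlrpcd_decref();		(at cleanup; stops the threads on
+ *					 last put)
+ */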
+/** @} ptlrpcd */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c
new file mode 100644
index 000000000..7b1d72947
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/recover.c
@@ -0,0 +1,379 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recover.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_support.h"
+#include "../include/lustre_ha.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_export.h"
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include <linux/list.h>
+
+#include "ptlrpc_internal.h"
+
+/**
+ * Start recovery on disconnected import.
+ * This is done by just attempting a connect
+ */
+void ptlrpc_initiate_recovery(struct obd_import *imp)
+{
+ CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd));
+ ptlrpc_connect_import(imp);
+}
+
+/**
+ * Identify which request from the replay list needs to be replayed next
+ * (based on what we have already replayed) and send it to the server.
+ */
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
+{
+ int rc = 0;
+ struct list_head *tmp, *pos;
+ struct ptlrpc_request *req = NULL;
+ __u64 last_transno;
+
+ *inflight = 0;
+
+ /* The server might have committed some requests after we last spoke,
+ * so make sure we get rid of them now.
+ */
+ spin_lock(&imp->imp_lock);
+ imp->imp_last_transno_checked = 0;
+ ptlrpc_free_committed(imp);
+ last_transno = imp->imp_last_replay_transno;
+ spin_unlock(&imp->imp_lock);
+
+ CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n",
+ imp, obd2cli_tgt(imp->imp_obd),
+ imp->imp_peer_committed_transno, last_transno);
+
+ /* Do I need to hold a lock across this iteration? We shouldn't be
+ * racing with any additions to the list, because we're in recovery
+ * and are therefore not processing additional requests to add. Calls
+ * to ptlrpc_free_committed might commit requests, but nothing "newer"
+ * than the one we're replaying (it can't be committed until it's
+ * replayed, and we're doing that here). l_f_e_safe protects against
+ * problems with the current request being committed, in the unlikely
+ * event of that race. So, in conclusion, I think that it's safe to
+ * perform this list-walk without the imp_lock held.
+ *
+ * But, the {mdc,osc}_replay_open callbacks both iterate
+ * request lists, and have comments saying they assume the
+ * imp_lock is being held by ptlrpc_replay, but it's not. It's
+ * just a little race...
+ */
+
+ /* Replay all the committed open requests on committed_list first */
+ if (!list_empty(&imp->imp_committed_list)) {
+ tmp = imp->imp_committed_list.prev;
+ req = list_entry(tmp, struct ptlrpc_request,
+ rq_replay_list);
+
+ /* The last request on committed_list hasn't been replayed */
+ if (req->rq_transno > last_transno) {
+ /* Since the imp_committed_list is immutable until
+ * all of its requests have been replayed, it's safe to
+ * use a cursor to accelerate the search */
+ imp->imp_replay_cursor = imp->imp_replay_cursor->next;
+
+ while (imp->imp_replay_cursor !=
+ &imp->imp_committed_list) {
+ req = list_entry(imp->imp_replay_cursor,
+ struct ptlrpc_request,
+ rq_replay_list);
+ if (req->rq_transno > last_transno)
+ break;
+
+ req = NULL;
+ imp->imp_replay_cursor =
+ imp->imp_replay_cursor->next;
+ }
+ } else {
+ /* All requests on committed_list have been replayed */
+ imp->imp_replay_cursor = &imp->imp_committed_list;
+ req = NULL;
+ }
+ }
+
+ /* All the requests on the committed list have been replayed; now
+ * replay the imp_replay_list */
+ if (req == NULL) {
+ list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+ req = list_entry(tmp, struct ptlrpc_request,
+ rq_replay_list);
+
+ if (req->rq_transno > last_transno)
+ break;
+ req = NULL;
+ }
+ }
+
+ /* If we need to resend the last sent transno (because a reconnect
+ * has occurred), then stop on the matching req and send it again.
+ * If, however, the last sent transno has been committed, then we
+ * continue replay from the next request. */
+ if (req != NULL && imp->imp_resend_replay)
+ lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+
+ spin_lock(&imp->imp_lock);
+ imp->imp_resend_replay = 0;
+ spin_unlock(&imp->imp_lock);
+
+ if (req != NULL) {
+ rc = ptlrpc_replay_req(req);
+ if (rc) {
+ CERROR("recovery replay error %d for req %llu\n",
+ rc, req->rq_xid);
+ return rc;
+ }
+ *inflight = 1;
+ }
+ return rc;
+}
+
+/**
+ * Schedule resending of requests on the sending_list. This is done after
+ * we have completed replaying requests and locks.
+ */
+int ptlrpc_resend(struct obd_import *imp)
+{
+ struct ptlrpc_request *req, *next;
+
+ /* As long as we're in recovery, nothing should be added to the sending
+ * list, so we don't need to hold the lock during this iteration and
+ * resend process.
+ */
+ /* Well... what if lctl recover is called twice at the same time?
+ */
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state != LUSTRE_IMP_RECOVER) {
+ spin_unlock(&imp->imp_lock);
+ return -1;
+ }
+
+ list_for_each_entry_safe(req, next, &imp->imp_sending_list,
+ rq_list) {
+ LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON,
+ "req %p bad\n", req);
+ LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req);
+ if (!ptlrpc_no_resend(req))
+ ptlrpc_resend_req(req);
+ }
+ spin_unlock(&imp->imp_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_resend);
+
+/**
+ * Go through all requests in delayed list and wake their threads
+ * for resending
+ */
+void ptlrpc_wake_delayed(struct obd_import *imp)
+{
+ struct list_head *tmp, *pos;
+ struct ptlrpc_request *req;
+
+ spin_lock(&imp->imp_lock);
+ list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
+ req = list_entry(tmp, struct ptlrpc_request, rq_list);
+
+ DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+ ptlrpc_client_wake_req(req);
+ }
+ spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_wake_delayed);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
+{
+ struct obd_import *imp = failed_req->rq_import;
+
+ CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
+ imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid);
+
+ if (ptlrpc_set_import_discon(imp,
+ lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) {
+ if (!imp->imp_replayable) {
+ CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n",
+ obd2cli_tgt(imp->imp_obd),
+ imp->imp_connection->c_remote_uuid.uuid,
+ imp->imp_obd->obd_name);
+ ptlrpc_deactivate_import(imp);
+ }
+ /* to control recovery via lctl {disable|enable}_recovery */
+ if (imp->imp_deactive == 0)
+ ptlrpc_connect_import(imp);
+ }
+
+ /* Wait for recovery to complete and resend. If evicted, then
+ * this request will be errored out later. */
+ spin_lock(&failed_req->rq_lock);
+ if (!failed_req->rq_no_resend)
+ failed_req->rq_resend = 1;
+ spin_unlock(&failed_req->rq_lock);
+}
+
+/**
+ * Administratively activate/deactivate a client.
+ * This should only be called by the ioctl interface, currently
+ * - the lctl deactivate and activate commands
+ * - echo 0/1 >> /proc/osc/XXX/active
+ * - client umount -f (ll_umount_begin)
+ */
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
+{
+ struct obd_device *obd = imp->imp_obd;
+ int rc = 0;
+
+ LASSERT(obd);
+
+ /* When deactivating, mark import invalid, and abort in-flight
+ * requests. */
+ if (!active) {
+ LCONSOLE_WARN("setting import %s INACTIVE by administrator request\n",
+ obd2cli_tgt(imp->imp_obd));
+
+ /* set before invalidate to avoid messages about imp_inval
+ * set without imp_deactive in ptlrpc_import_delay_req */
+ spin_lock(&imp->imp_lock);
+ imp->imp_deactive = 1;
+ spin_unlock(&imp->imp_lock);
+
+ obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE);
+
+ ptlrpc_invalidate_import(imp);
+ }
+
+ /* When activating, mark import valid, and attempt recovery */
+ if (active) {
+ CDEBUG(D_HA, "setting import %s VALID\n",
+ obd2cli_tgt(imp->imp_obd));
+
+ spin_lock(&imp->imp_lock);
+ imp->imp_deactive = 0;
+ spin_unlock(&imp->imp_lock);
+ obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE);
+
+ rc = ptlrpc_recover_import(imp, NULL, 0);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_set_import_active);
+
+/* Attempt to reconnect an import */
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async)
+{
+ int rc = 0;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive ||
+ atomic_read(&imp->imp_inval_count))
+ rc = -EINVAL;
+ spin_unlock(&imp->imp_lock);
+ if (rc)
+ goto out;
+
+ /* force import to be disconnected. */
+ ptlrpc_set_import_discon(imp, 0);
+
+ if (new_uuid) {
+ struct obd_uuid uuid;
+
+ /* instruct the import to use the new uuid */
+ obd_str2uuid(&uuid, new_uuid);
+ rc = import_set_conn_priority(imp, &uuid);
+ if (rc)
+ goto out;
+ }
+
+ /* Check if reconnect is already in progress */
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state != LUSTRE_IMP_DISCON) {
+ imp->imp_force_verify = 1;
+ rc = -EALREADY;
+ }
+ spin_unlock(&imp->imp_lock);
+ if (rc)
+ goto out;
+
+ rc = ptlrpc_connect_import(imp);
+ if (rc)
+ goto out;
+
+ if (!async) {
+ struct l_wait_info lwi;
+ int secs = cfs_time_seconds(obd_timeout);
+
+ CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+ obd2cli_tgt(imp->imp_obd), secs);
+
+ lwi = LWI_TIMEOUT(secs, NULL, NULL);
+ rc = l_wait_event(imp->imp_recovery_waitq,
+ !ptlrpc_import_in_recovery(imp), &lwi);
+ CDEBUG(D_HA, "%s: recovery finished\n",
+ obd2cli_tgt(imp->imp_obd));
+ }
+
+out:
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_recover_import);
+
+int ptlrpc_import_in_recovery(struct obd_import *imp)
+{
+ int in_recovery = 1;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_state == LUSTRE_IMP_FULL ||
+ imp->imp_state == LUSTRE_IMP_CLOSED ||
+ imp->imp_state == LUSTRE_IMP_DISCON ||
+ imp->imp_obd->obd_no_recov)
+ in_recovery = 0;
+ spin_unlock(&imp->imp_lock);
+
+ return in_recovery;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
new file mode 100644
index 000000000..21e9dc9d5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c
@@ -0,0 +1,2459 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_sec.h"
+
+#include "ptlrpc_internal.h"
+
+/***********************************************
+ * policy registration *
+ ***********************************************/
+
+static rwlock_t policy_lock;
+static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = {
+ NULL,
+};
+
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy)
+{
+ __u16 number = policy->sp_policy;
+
+ LASSERT(policy->sp_name);
+ LASSERT(policy->sp_cops);
+ LASSERT(policy->sp_sops);
+
+ if (number >= SPTLRPC_POLICY_MAX)
+ return -EINVAL;
+
+ write_lock(&policy_lock);
+ if (unlikely(policies[number])) {
+ write_unlock(&policy_lock);
+ return -EALREADY;
+ }
+ policies[number] = policy;
+ write_unlock(&policy_lock);
+
+ CDEBUG(D_SEC, "%s: registered\n", policy->sp_name);
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_register_policy);
+
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy)
+{
+ __u16 number = policy->sp_policy;
+
+ LASSERT(number < SPTLRPC_POLICY_MAX);
+
+ write_lock(&policy_lock);
+ if (unlikely(policies[number] == NULL)) {
+ write_unlock(&policy_lock);
+ CERROR("%s: already unregistered\n", policy->sp_name);
+ return -EINVAL;
+ }
+
+ LASSERT(policies[number] == policy);
+ policies[number] = NULL;
+ write_unlock(&policy_lock);
+
+ CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name);
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unregister_policy);
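+
+/*
+ * Usage sketch (hypothetical "foo_*" names; FOO_POLICY_NUMBER stands for a
+ * policy number below SPTLRPC_POLICY_MAX): a security policy module fills
+ * in a struct ptlrpc_sec_policy and registers it at module load time:
+ *
+ *	static struct ptlrpc_sec_policy foo_policy = {
+ *		.sp_owner  = THIS_MODULE,
+ *		.sp_name   = "foo",
+ *		.sp_policy = FOO_POLICY_NUMBER,
+ *		.sp_cops   = &foo_cli_ops,
+ *		.sp_sops   = &foo_svc_ops,
+ *	};
+ *
+ *	rc = sptlrpc_register_policy(&foo_policy);
+ *	...
+ *	sptlrpc_unregister_policy(&foo_policy);
+ */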
+
+static
+struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor)
+{
+ static DEFINE_MUTEX(load_mutex);
+ static atomic_t loaded = ATOMIC_INIT(0);
+ struct ptlrpc_sec_policy *policy;
+ __u16 number = SPTLRPC_FLVR_POLICY(flavor);
+ __u16 flag = 0;
+
+ if (number >= SPTLRPC_POLICY_MAX)
+ return NULL;
+
+ while (1) {
+ read_lock(&policy_lock);
+ policy = policies[number];
+ if (policy && !try_module_get(policy->sp_owner))
+ policy = NULL;
+ if (policy == NULL)
+ flag = atomic_read(&loaded);
+ read_unlock(&policy_lock);
+
+ if (policy != NULL || flag != 0 ||
+ number != SPTLRPC_POLICY_GSS)
+ break;
+
+ /* try to load gss module, once */
+ mutex_lock(&load_mutex);
+ if (atomic_read(&loaded) == 0) {
+ if (request_module("ptlrpc_gss") == 0)
+ CDEBUG(D_SEC,
+ "module ptlrpc_gss loaded on demand\n");
+ else
+ CERROR("Unable to load module ptlrpc_gss\n");
+
+ atomic_set(&loaded, 1);
+ }
+ mutex_unlock(&load_mutex);
+ }
+
+ return policy;
+}
+
+__u32 sptlrpc_name2flavor_base(const char *name)
+{
+ if (!strcmp(name, "null"))
+ return SPTLRPC_FLVR_NULL;
+ if (!strcmp(name, "plain"))
+ return SPTLRPC_FLVR_PLAIN;
+ if (!strcmp(name, "krb5n"))
+ return SPTLRPC_FLVR_KRB5N;
+ if (!strcmp(name, "krb5a"))
+ return SPTLRPC_FLVR_KRB5A;
+ if (!strcmp(name, "krb5i"))
+ return SPTLRPC_FLVR_KRB5I;
+ if (!strcmp(name, "krb5p"))
+ return SPTLRPC_FLVR_KRB5P;
+
+ return SPTLRPC_FLVR_INVALID;
+}
+EXPORT_SYMBOL(sptlrpc_name2flavor_base);
+
+const char *sptlrpc_flavor2name_base(__u32 flvr)
+{
+ __u32 base = SPTLRPC_FLVR_BASE(flvr);
+
+ if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL))
+ return "null";
+ else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN))
+ return "plain";
+ else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N))
+ return "krb5n";
+ else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A))
+ return "krb5a";
+ else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I))
+ return "krb5i";
+ else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P))
+ return "krb5p";
+
+ CERROR("invalid wire flavor 0x%x\n", flvr);
+ return "invalid";
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_base);
+
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+ char *buf, int bufsize)
+{
+ if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN)
+ snprintf(buf, bufsize, "hash:%s",
+ sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg));
+ else
+ snprintf(buf, bufsize, "%s",
+ sptlrpc_flavor2name_base(sf->sf_rpc));
+
+ buf[bufsize - 1] = '\0';
+ return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_bulk);
+
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+{
+ strlcpy(buf, sptlrpc_flavor2name_base(sf->sf_rpc), bufsize);
+
+ /*
+ * currently we don't support customized bulk specification for
+ * flavors other than plain
+ */
+ if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) {
+ char bspec[16];
+
+ bspec[0] = '-';
+ sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1);
+ strlcat(buf, bspec, bufsize);
+ }
+
+ return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name);
+
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize)
+{
+ buf[0] = '\0';
+
+ if (flags & PTLRPC_SEC_FL_REVERSE)
+ strlcat(buf, "reverse,", bufsize);
+ if (flags & PTLRPC_SEC_FL_ROOTONLY)
+ strlcat(buf, "rootonly,", bufsize);
+ if (flags & PTLRPC_SEC_FL_UDESC)
+ strlcat(buf, "udesc,", bufsize);
+ if (flags & PTLRPC_SEC_FL_BULK)
+ strlcat(buf, "bulk,", bufsize);
+ if (buf[0] == '\0')
+ strlcat(buf, "-,", bufsize);
+
+ return buf;
+}
+EXPORT_SYMBOL(sptlrpc_secflags2str);
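+
+/*
+ * Illustrative sketch, not part of the original file: formatting the
+ * current flavor of a sec for logging with the helpers above. The exact
+ * output depends on the configured flavor (e.g. something like
+ * "plain-hash:<alg>" when the plain policy carries a bulk hash); this is
+ * an assumption shown for illustration only.
+ */
+#if 0 /* example only, not built */
+static void example_log_flavor(struct ptlrpc_sec *sec)
+{
+ char str[32];
+
+ CDEBUG(D_SEC, "current flavor: %s\n",
+ sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)));
+}
+#endif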
+
+/**************************************************
+ * client context APIs *
+ **************************************************/
+
+static
+struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec)
+{
+ struct vfs_cred vcred;
+ int create = 1, remove_dead = 1;
+
+ LASSERT(sec);
+ LASSERT(sec->ps_policy->sp_cops->lookup_ctx);
+
+ if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE |
+ PTLRPC_SEC_FL_ROOTONLY)) {
+ vcred.vc_uid = 0;
+ vcred.vc_gid = 0;
+ if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) {
+ create = 0;
+ remove_dead = 0;
+ }
+ } else {
+ vcred.vc_uid = from_kuid(&init_user_ns, current_uid());
+ vcred.vc_gid = from_kgid(&init_user_ns, current_gid());
+ }
+
+ return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred,
+ create, remove_dead);
+}
+
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx)
+{
+ atomic_inc(&ctx->cc_refcount);
+ return ctx;
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_get);
+
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+ struct ptlrpc_sec *sec = ctx->cc_sec;
+
+ LASSERT(sec);
+ LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+ if (!atomic_dec_and_test(&ctx->cc_refcount))
+ return;
+
+ sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_put);
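+
+/*
+ * Illustrative sketch, not part of the original file: the usual reference
+ * pattern for a client context is to pin it with sptlrpc_cli_ctx_get()
+ * for as long as it is used and then drop it with sptlrpc_cli_ctx_put();
+ * the last put releases the context through the owning policy.
+ */
+#if 0 /* example only, not built */
+static void example_ctx_ref(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cli_ctx *ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx);
+
+ /* ... use ctx for the duration of the operation ... */
+
+ sptlrpc_cli_ctx_put(ctx, 1); /* sync release if this was the last ref */
+}
+#endif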
+
+/**
+ * Expire the client context immediately.
+ *
+ * \pre Caller must hold at least 1 reference on the \a ctx.
+ */
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+ LASSERT(ctx->cc_ops->force_die);
+ ctx->cc_ops->force_die(ctx, 0);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_expire);
+
+/**
+ * Wake up the threads that are waiting for this client context. Called
+ * after a status change has happened on \a ctx.
+ */
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx)
+{
+ struct ptlrpc_request *req, *next;
+
+ spin_lock(&ctx->cc_lock);
+ list_for_each_entry_safe(req, next, &ctx->cc_req_list,
+ rq_ctx_chain) {
+ list_del_init(&req->rq_ctx_chain);
+ ptlrpc_client_wake_req(req);
+ }
+ spin_unlock(&ctx->cc_lock);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup);
+
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize)
+{
+ LASSERT(ctx->cc_ops);
+
+ if (ctx->cc_ops->display == NULL)
+ return 0;
+
+ return ctx->cc_ops->display(ctx, buf, bufsize);
+}
+
+static int import_sec_check_expire(struct obd_import *imp)
+{
+ int adapt = 0;
+
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_sec_expire &&
+ imp->imp_sec_expire < get_seconds()) {
+ adapt = 1;
+ imp->imp_sec_expire = 0;
+ }
+ spin_unlock(&imp->imp_lock);
+
+ if (!adapt)
+ return 0;
+
+ CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n");
+ return sptlrpc_import_sec_adapt(imp, NULL, NULL);
+}
+
+static int import_sec_validate_get(struct obd_import *imp,
+ struct ptlrpc_sec **sec)
+{
+ int rc;
+
+ if (unlikely(imp->imp_sec_expire)) {
+ rc = import_sec_check_expire(imp);
+ if (rc)
+ return rc;
+ }
+
+ *sec = sptlrpc_import_sec_ref(imp);
+ if (*sec == NULL) {
+ CERROR("import %p (%s) with no sec\n",
+ imp, ptlrpc_import_state_name(imp->imp_state));
+ return -EACCES;
+ }
+
+ if (unlikely((*sec)->ps_dying)) {
+ CERROR("attempt to use dying sec %p\n", sec);
+ sptlrpc_sec_put(*sec);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+/**
+ * Given a \a req, find or allocate an appropriate context for it.
+ * \pre req->rq_cli_ctx == NULL.
+ *
+ * \retval 0 success, and req->rq_cli_ctx is set.
+ * \retval -ev error number, and req->rq_cli_ctx == NULL.
+ */
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req)
+{
+ struct obd_import *imp = req->rq_import;
+ struct ptlrpc_sec *sec;
+ int rc;
+
+ LASSERT(!req->rq_cli_ctx);
+ LASSERT(imp);
+
+ rc = import_sec_validate_get(imp, &sec);
+ if (rc)
+ return rc;
+
+ req->rq_cli_ctx = get_my_ctx(sec);
+
+ sptlrpc_sec_put(sec);
+
+ if (!req->rq_cli_ctx) {
+ CERROR("req %p: fail to get context\n", req);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/**
+ * Drop the context for \a req.
+ * \pre req->rq_cli_ctx != NULL.
+ * \post req->rq_cli_ctx == NULL.
+ *
+ * If \a sync == 0, this function should return quickly without sleeping;
+ * otherwise it might trigger and wait for the whole process of sending
+ * a context-destroying RPC to the server.
+ */
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync)
+{
+ LASSERT(req);
+ LASSERT(req->rq_cli_ctx);
+
+ /* the request might be asked to release its context early while
+ * it is still on the context waiting list.
+ */
+ if (!list_empty(&req->rq_ctx_chain)) {
+ spin_lock(&req->rq_cli_ctx->cc_lock);
+ list_del_init(&req->rq_ctx_chain);
+ spin_unlock(&req->rq_cli_ctx->cc_lock);
+ }
+
+ sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync);
+ req->rq_cli_ctx = NULL;
+}
+
+static
+int sptlrpc_req_ctx_switch(struct ptlrpc_request *req,
+ struct ptlrpc_cli_ctx *oldctx,
+ struct ptlrpc_cli_ctx *newctx)
+{
+ struct sptlrpc_flavor old_flvr;
+ char *reqmsg = NULL; /* to work around old gcc */
+ int reqmsg_size;
+ int rc = 0;
+
+ LASSERT(req->rq_reqmsg);
+ LASSERT(req->rq_reqlen);
+ LASSERT(req->rq_replen);
+
+ CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), switch sec %p(%s) -> %p(%s)\n",
+ req,
+ oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec),
+ newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec),
+ oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name,
+ newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name);
+
+ /* save flavor */
+ old_flvr = req->rq_flvr;
+
+ /* save request message */
+ reqmsg_size = req->rq_reqlen;
+ if (reqmsg_size != 0) {
+ OBD_ALLOC_LARGE(reqmsg, reqmsg_size);
+ if (reqmsg == NULL)
+ return -ENOMEM;
+ memcpy(reqmsg, req->rq_reqmsg, reqmsg_size);
+ }
+
+ /* release old req/rep buf */
+ req->rq_cli_ctx = oldctx;
+ sptlrpc_cli_free_reqbuf(req);
+ sptlrpc_cli_free_repbuf(req);
+ req->rq_cli_ctx = newctx;
+
+ /* recalculate the flavor */
+ sptlrpc_req_set_flavor(req, 0);
+
+ /* allocate a new request buffer; we don't need to allocate the
+ * reply buffer here, that is left to the rest of the ptlrpc
+ * processing */
+ if (reqmsg_size != 0) {
+ rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size);
+ if (!rc) {
+ LASSERT(req->rq_reqmsg);
+ memcpy(req->rq_reqmsg, reqmsg, reqmsg_size);
+ } else {
+ CWARN("failed to alloc reqbuf: %d\n", rc);
+ req->rq_flvr = old_flvr;
+ }
+
+ OBD_FREE_LARGE(reqmsg, reqmsg_size);
+ }
+ return rc;
+}
+
+/**
+ * If the current context of \a req is dead somehow, e.g. we just switched
+ * flavor and thus marked the original contexts dead, find a new context for
+ * it. If no switch is needed, \a req ends up with the same context.
+ *
+ * \note A request must always have a context, to keep other parts of the
+ * code happy. On any failure during the switch, we must restore the old one.
+ */
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx;
+ struct ptlrpc_cli_ctx *newctx;
+ int rc;
+
+ LASSERT(oldctx);
+
+ sptlrpc_cli_ctx_get(oldctx);
+ sptlrpc_req_put_ctx(req, 0);
+
+ rc = sptlrpc_req_get_ctx(req);
+ if (unlikely(rc)) {
+ LASSERT(!req->rq_cli_ctx);
+
+ /* restore old ctx */
+ req->rq_cli_ctx = oldctx;
+ return rc;
+ }
+
+ newctx = req->rq_cli_ctx;
+ LASSERT(newctx);
+
+ if (unlikely(newctx == oldctx &&
+ test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) {
+ /*
+ * still got the old dead ctx, which usually means the system is too busy
+ */
+ CDEBUG(D_SEC,
+ "ctx (%p, fl %lx) doesn't switch, relax a little bit\n",
+ newctx, newctx->cc_flags);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ } else {
+ /*
+ * it's possible newctx == oldctx if we're switching
+ * subflavor with the same sec.
+ */
+ rc = sptlrpc_req_ctx_switch(req, oldctx, newctx);
+ if (rc) {
+ /* restore old ctx */
+ sptlrpc_req_put_ctx(req, 0);
+ req->rq_cli_ctx = oldctx;
+ return rc;
+ }
+
+ LASSERT(req->rq_cli_ctx == newctx);
+ }
+
+ sptlrpc_cli_ctx_put(oldctx, 1);
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx);
+
+static
+int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+ if (cli_ctx_is_refreshed(ctx))
+ return 1;
+ return 0;
+}
+
+static
+int ctx_refresh_timeout(void *data)
+{
+ struct ptlrpc_request *req = data;
+ int rc;
+
+ /* conn_cnt is needed in expire_one_request */
+ lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt);
+
+ rc = ptlrpc_expire_one_request(req, 1);
+ /* if we started recovery, we should mark this ctx dead; otherwise,
+ * in case lgssd died, nobody would retire this ctx and subsequent
+ * connect attempts would still find the same ctx, causing a deadlock.
+ * there's an assumption that the expiry time of the request is
+ * later than the context refresh expiry time.
+ */
+ if (rc == 0)
+ req->rq_cli_ctx->cc_ops->force_die(req->rq_cli_ctx, 0);
+ return rc;
+}
+
+static
+void ctx_refresh_interrupt(void *data)
+{
+ struct ptlrpc_request *req = data;
+
+ spin_lock(&req->rq_lock);
+ req->rq_intr = 1;
+ spin_unlock(&req->rq_lock);
+}
+
+static
+void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx)
+{
+ spin_lock(&ctx->cc_lock);
+ if (!list_empty(&req->rq_ctx_chain))
+ list_del_init(&req->rq_ctx_chain);
+ spin_unlock(&ctx->cc_lock);
+}
+
+/**
+ * Refresh the context of \a req, if it's not up to date.
+ * \param timeout
+ * - < 0: don't wait
+ * - = 0: wait until success or a fatal error occurs
+ * - > 0: timeout value (in seconds)
+ *
+ * The status of the context may be changed by other threads at any time.
+ * We allow this race, but once we return 0, the caller assumes the context
+ * is up to date and keeps using it until the owning RPC is done.
+ *
+ * \retval 0 only if the context is up to date.
+ * \retval -ev error number.
+ */
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ struct ptlrpc_sec *sec;
+ struct l_wait_info lwi;
+ int rc;
+
+ LASSERT(ctx);
+
+ if (req->rq_ctx_init || req->rq_ctx_fini)
+ return 0;
+
+ /*
+ * during the process a request's context might even change type
+ * (e.g. from a gss ctx to a null ctx), so on each loop we need to
+ * re-check everything
+ */
+again:
+ rc = import_sec_validate_get(req->rq_import, &sec);
+ if (rc)
+ return rc;
+
+ if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+ CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n",
+ req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc);
+ req_off_ctx_list(req, ctx);
+ sptlrpc_req_replace_dead_ctx(req);
+ ctx = req->rq_cli_ctx;
+ }
+ sptlrpc_sec_put(sec);
+
+ if (cli_ctx_is_eternal(ctx))
+ return 0;
+
+ if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) {
+ LASSERT(ctx->cc_ops->refresh);
+ ctx->cc_ops->refresh(ctx);
+ }
+ LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0);
+
+ LASSERT(ctx->cc_ops->validate);
+ if (ctx->cc_ops->validate(ctx) == 0) {
+ req_off_ctx_list(req, ctx);
+ return 0;
+ }
+
+ if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) {
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ req_off_ctx_list(req, ctx);
+ return -EPERM;
+ }
+
+ /*
+ * There's a subtle issue for resending RPCs; suppose the following
+ * situation:
+ * 1. the request was sent to the server.
+ * 2. recovery was kicked off; after it finished, the request was
+ * marked as resent.
+ * 3. the request is resent.
+ * 4. the old reply from the server is received; we accept and verify
+ * the reply. this has to succeed, otherwise the error would be
+ * seen by the application.
+ * 5. the new reply from the server is received and dropped by LNet.
+ *
+ * Note the xid of the old & new request is the same. We can't simply
+ * change the xid of the resent request because the server relies on
+ * it for reply reconstruction.
+ *
+ * Commonly the original context should be up to date because we
+ * have a nice expiry time; the server will keep its context because
+ * we at least hold a ref on the old context, which prevents the
+ * context-destroying RPC from being sent. So the server can still
+ * accept the request and finish the RPC. But if that's not the case:
+ * 1. If the server-side context has been trimmed, NO_CONTEXT will
+ * be returned and gss_cli_ctx_verify/unseal will switch to the new
+ * context by force.
+ * 2. If the current context was never refreshed, then we are fine: we
+ * never really sent a request with the old context before.
+ */
+ if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) &&
+ unlikely(req->rq_reqmsg) &&
+ lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+ req_off_ctx_list(req, ctx);
+ return 0;
+ }
+
+ if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) {
+ req_off_ctx_list(req, ctx);
+ /*
+ * don't switch ctx if import was deactivated
+ */
+ if (req->rq_import->imp_deactive) {
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ return -EINTR;
+ }
+
+ rc = sptlrpc_req_replace_dead_ctx(req);
+ if (rc) {
+ LASSERT(ctx == req->rq_cli_ctx);
+ CERROR("req %p: failed to replace dead ctx %p: %d\n",
+ req, ctx, rc);
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ spin_unlock(&req->rq_lock);
+ return rc;
+ }
+
+ ctx = req->rq_cli_ctx;
+ goto again;
+ }
+
+ /*
+ * Now we're sure this context is during upcall, add myself into
+ * waiting list
+ */
+ spin_lock(&ctx->cc_lock);
+ if (list_empty(&req->rq_ctx_chain))
+ list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
+ spin_unlock(&ctx->cc_lock);
+
+ if (timeout < 0)
+ return -EWOULDBLOCK;
+
+ /* Clear any flags that may be present from previous sends */
+ LASSERT(req->rq_receiving_reply == 0);
+ spin_lock(&req->rq_lock);
+ req->rq_err = 0;
+ req->rq_timedout = 0;
+ req->rq_resend = 0;
+ req->rq_restart = 0;
+ spin_unlock(&req->rq_lock);
+
+ lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout,
+ ctx_refresh_interrupt, req);
+ rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi);
+
+ /*
+ * following cases could lead us here:
+ * - successfully refreshed;
+ * - interrupted;
+ * - timed out, and we don't want to recover from the failure;
+ * - timed out, and woken up when recovery finished;
+ * - someone else marked this ctx dead by force;
+ * - someone invalidated the req and called ptlrpc_client_wake_req(),
+ * e.g. ptlrpc_abort_inflight();
+ */
+ if (!cli_ctx_is_refreshed(ctx)) {
+ /* timed out or interrupted */
+ req_off_ctx_list(req, ctx);
+
+ LASSERT(rc != 0);
+ return rc;
+ }
+
+ goto again;
+}
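+
+/*
+ * Illustrative sketch, not part of the original file: a caller would
+ * typically refresh the context right before (re)sending a request. A
+ * negative timeout probes without blocking, 0 waits indefinitely and a
+ * positive value waits that many seconds; using obd_timeout as the bound
+ * below is an assumption for illustration.
+ */
+#if 0 /* example only, not built */
+static int example_refresh_ctx(struct ptlrpc_request *req)
+{
+ int rc;
+
+ rc = sptlrpc_req_refresh_ctx(req, -1); /* non-blocking probe */
+ if (rc == -EWOULDBLOCK) /* upcall in progress, wait for it */
+ rc = sptlrpc_req_refresh_ctx(req, obd_timeout);
+ return rc;
+}
+#endif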
+
+/**
+ * Initialize flavor settings for \a req, according to \a opcode.
+ *
+ * \note this could be called in two situations:
+ * - new request from ptlrpc_pre_req(), with proper @opcode
+ * - old request which changed ctx in the middle, with @opcode == 0
+ */
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
+{
+ struct ptlrpc_sec *sec;
+
+ LASSERT(req->rq_import);
+ LASSERT(req->rq_cli_ctx);
+ LASSERT(req->rq_cli_ctx->cc_sec);
+ LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0);
+
+ /* special security flags according to opcode */
+ switch (opcode) {
+ case OST_READ:
+ case MDS_READPAGE:
+ case MGS_CONFIG_READ:
+ case OBD_IDX_READ:
+ req->rq_bulk_read = 1;
+ break;
+ case OST_WRITE:
+ case MDS_WRITEPAGE:
+ req->rq_bulk_write = 1;
+ break;
+ case SEC_CTX_INIT:
+ req->rq_ctx_init = 1;
+ break;
+ case SEC_CTX_FINI:
+ req->rq_ctx_fini = 1;
+ break;
+ case 0:
+ /* init/fini rpcs won't be resent, so they can't be here */
+ LASSERT(req->rq_ctx_init == 0);
+ LASSERT(req->rq_ctx_fini == 0);
+
+ /* cleanup flags, which should be recalculated */
+ req->rq_pack_udesc = 0;
+ req->rq_pack_bulk = 0;
+ break;
+ }
+
+ sec = req->rq_cli_ctx->cc_sec;
+
+ spin_lock(&sec->ps_lock);
+ req->rq_flvr = sec->ps_flvr;
+ spin_unlock(&sec->ps_lock);
+
+ /* force SVC_NULL for context initiation rpc, SVC_INTG for context
+ * destruction rpc */
+ if (unlikely(req->rq_ctx_init))
+ flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
+ else if (unlikely(req->rq_ctx_fini))
+ flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
+
+ /* user descriptor flag, null security can't do it anyway */
+ if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) &&
+ (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL))
+ req->rq_pack_udesc = 1;
+
+ /* bulk security flag */
+ if ((req->rq_bulk_read || req->rq_bulk_write) &&
+ sptlrpc_flavor_has_bulk(&req->rq_flvr))
+ req->rq_pack_bulk = 1;
+}
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req)
+{
+ if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV)
+ return;
+
+ LASSERT(req->rq_clrbuf);
+ if (req->rq_pool || !req->rq_reqbuf)
+ return;
+
+ OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+ req->rq_reqbuf = NULL;
+ req->rq_reqbuf_len = 0;
+}
+
+/**
+ * Given an import \a imp, check whether the current user has a valid
+ * context. We may create a new context and try to refresh it, retrying
+ * repeatedly in case of non-fatal errors. Returning 0 means success.
+ */
+int sptlrpc_import_check_ctx(struct obd_import *imp)
+{
+ struct ptlrpc_sec *sec;
+ struct ptlrpc_cli_ctx *ctx;
+ struct ptlrpc_request *req = NULL;
+ int rc;
+
+ might_sleep();
+
+ sec = sptlrpc_import_sec_ref(imp);
+ ctx = get_my_ctx(sec);
+ sptlrpc_sec_put(sec);
+
+ if (!ctx)
+ return -ENOMEM;
+
+ if (cli_ctx_is_eternal(ctx) ||
+ ctx->cc_ops->validate(ctx) == 0) {
+ sptlrpc_cli_ctx_put(ctx, 1);
+ return 0;
+ }
+
+ if (cli_ctx_is_error(ctx)) {
+ sptlrpc_cli_ctx_put(ctx, 1);
+ return -EACCES;
+ }
+
+ req = ptlrpc_request_cache_alloc(GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
+ spin_lock_init(&req->rq_lock);
+ atomic_set(&req->rq_refcount, 10000);
+ INIT_LIST_HEAD(&req->rq_ctx_chain);
+ init_waitqueue_head(&req->rq_reply_waitq);
+ init_waitqueue_head(&req->rq_set_waitq);
+ req->rq_import = imp;
+ req->rq_flvr = sec->ps_flvr;
+ req->rq_cli_ctx = ctx;
+
+ rc = sptlrpc_req_refresh_ctx(req, 0);
+ LASSERT(list_empty(&req->rq_ctx_chain));
+ sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1);
+ ptlrpc_request_cache_free(req);
+
+ return rc;
+}
+
+/**
+ * Used by the ptlrpc client to perform the pre-defined security
+ * transformation upon the request message of \a req. After this function
+ * is called, req->rq_reqmsg is still accessible as clear text.
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ int rc = 0;
+
+ LASSERT(ctx);
+ LASSERT(ctx->cc_sec);
+ LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+ /* we wrap the bulk request here because now we can be sure
+ * the context is up to date.
+ */
+ if (req->rq_bulk) {
+ rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk);
+ if (rc)
+ return rc;
+ }
+
+ switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+ case SPTLRPC_SVC_NULL:
+ case SPTLRPC_SVC_AUTH:
+ case SPTLRPC_SVC_INTG:
+ LASSERT(ctx->cc_ops->sign);
+ rc = ctx->cc_ops->sign(ctx, req);
+ break;
+ case SPTLRPC_SVC_PRIV:
+ LASSERT(ctx->cc_ops->seal);
+ rc = ctx->cc_ops->seal(ctx, req);
+ break;
+ default:
+ LBUG();
+ }
+
+ if (rc == 0) {
+ LASSERT(req->rq_reqdata_len);
+ LASSERT(req->rq_reqdata_len % 8 == 0);
+ LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+ }
+
+ return rc;
+}
+
+static int do_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ int rc;
+
+ LASSERT(ctx);
+ LASSERT(ctx->cc_sec);
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_repdata);
+ LASSERT(req->rq_repmsg == NULL);
+
+ req->rq_rep_swab_mask = 0;
+
+ rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len);
+ switch (rc) {
+ case 1:
+ lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+ case 0:
+ break;
+ default:
+ CERROR("failed unpack reply: x%llu\n", req->rq_xid);
+ return -EPROTO;
+ }
+
+ if (req->rq_repdata_len < sizeof(struct lustre_msg)) {
+ CERROR("replied data length %d too small\n",
+ req->rq_repdata_len);
+ return -EPROTO;
+ }
+
+ if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) !=
+ SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+ CERROR("reply policy %u doesn't match request policy %u\n",
+ SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr),
+ SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc));
+ return -EPROTO;
+ }
+
+ switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+ case SPTLRPC_SVC_NULL:
+ case SPTLRPC_SVC_AUTH:
+ case SPTLRPC_SVC_INTG:
+ LASSERT(ctx->cc_ops->verify);
+ rc = ctx->cc_ops->verify(ctx, req);
+ break;
+ case SPTLRPC_SVC_PRIV:
+ LASSERT(ctx->cc_ops->unseal);
+ rc = ctx->cc_ops->unseal(ctx, req);
+ break;
+ default:
+ LBUG();
+ }
+ LASSERT(rc || req->rq_repmsg || req->rq_resend);
+
+ if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL &&
+ !req->rq_ctx_init)
+ req->rq_rep_swab_mask = 0;
+ return rc;
+}
+
+/**
+ * Used by the ptlrpc client to perform security transformation upon the
+ * reply message of \a req. After returning successfully, req->rq_repmsg
+ * points to the reply message in clear text.
+ *
+ * \pre the reply buffer should have been un-posted from LNet, so nothing is
+ * going to change.
+ */
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_repdata == NULL);
+ LASSERT(req->rq_repmsg == NULL);
+ LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len);
+
+ if (req->rq_reply_off == 0 &&
+ (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+ CERROR("real reply with offset 0\n");
+ return -EPROTO;
+ }
+
+ if (req->rq_reply_off % 8 != 0) {
+ CERROR("reply at odd offset %u\n", req->rq_reply_off);
+ return -EPROTO;
+ }
+
+ req->rq_repdata = (struct lustre_msg *)
+ (req->rq_repbuf + req->rq_reply_off);
+ req->rq_repdata_len = req->rq_nob_received;
+
+ return do_cli_unwrap_reply(req);
+}
+
+/**
+ * Used by the ptlrpc client to perform security transformation upon the
+ * early reply message of \a req. We expect rq_reply_off to be 0 and
+ * rq_nob_received to be the early reply size.
+ *
+ * Because the receive buffer might still be posted, the reply data might
+ * change at any time, whether or not we're holding rq_lock. For this reason
+ * we allocate a separate ptlrpc_request and reply buffer for early reply
+ * processing.
+ *
+ * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request.
+ * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned
+ * \a *req_ret to release it.
+ * \retval -ev error number, and \a req_ret will not be set.
+ */
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+ struct ptlrpc_request **req_ret)
+{
+ struct ptlrpc_request *early_req;
+ char *early_buf;
+ int early_bufsz, early_size;
+ int rc;
+
+ early_req = ptlrpc_request_cache_alloc(GFP_NOFS);
+ if (early_req == NULL)
+ return -ENOMEM;
+
+ early_size = req->rq_nob_received;
+ early_bufsz = size_roundup_power2(early_size);
+ OBD_ALLOC_LARGE(early_buf, early_bufsz);
+ if (early_buf == NULL) {
+ rc = -ENOMEM;
+ goto err_req;
+ }
+
+ /* sanity checks and copying the data out are done inside the spinlock */
+ spin_lock(&req->rq_lock);
+
+ if (req->rq_replied) {
+ spin_unlock(&req->rq_lock);
+ rc = -EALREADY;
+ goto err_buf;
+ }
+
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_repdata == NULL);
+ LASSERT(req->rq_repmsg == NULL);
+
+ if (req->rq_reply_off != 0) {
+ CERROR("early reply with offset %u\n", req->rq_reply_off);
+ spin_unlock(&req->rq_lock);
+ rc = -EPROTO;
+ goto err_buf;
+ }
+
+ if (req->rq_nob_received != early_size) {
+ /* even if another early reply arrived, the size should be the same */
+ CERROR("data size has changed from %u to %u\n",
+ early_size, req->rq_nob_received);
+ spin_unlock(&req->rq_lock);
+ rc = -EINVAL;
+ goto err_buf;
+ }
+
+ if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+ CERROR("early reply length %d too small\n",
+ req->rq_nob_received);
+ spin_unlock(&req->rq_lock);
+ rc = -EALREADY;
+ goto err_buf;
+ }
+
+ memcpy(early_buf, req->rq_repbuf, early_size);
+ spin_unlock(&req->rq_lock);
+
+ spin_lock_init(&early_req->rq_lock);
+ early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx);
+ early_req->rq_flvr = req->rq_flvr;
+ early_req->rq_repbuf = early_buf;
+ early_req->rq_repbuf_len = early_bufsz;
+ early_req->rq_repdata = (struct lustre_msg *) early_buf;
+ early_req->rq_repdata_len = early_size;
+ early_req->rq_early = 1;
+ early_req->rq_reqmsg = req->rq_reqmsg;
+
+ rc = do_cli_unwrap_reply(early_req);
+ if (rc) {
+ DEBUG_REQ(D_ADAPTTO, early_req,
+ "error %d unwrap early reply", rc);
+ goto err_ctx;
+ }
+
+ LASSERT(early_req->rq_repmsg);
+ *req_ret = early_req;
+ return 0;
+
+err_ctx:
+ sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+err_buf:
+ OBD_FREE_LARGE(early_buf, early_bufsz);
+err_req:
+ ptlrpc_request_cache_free(early_req);
+ return rc;
+}
+
+/**
+ * Used by ptlrpc client, to release a processed early reply \a early_req.
+ *
+ * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply().
+ */
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req)
+{
+ LASSERT(early_req->rq_repbuf);
+ LASSERT(early_req->rq_repdata);
+ LASSERT(early_req->rq_repmsg);
+
+ sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+ OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len);
+ ptlrpc_request_cache_free(early_req);
+}
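+
+/*
+ * Illustrative sketch, not part of the original file: the early-reply
+ * helpers above are used as a pair -- on success, the duplicated request
+ * must always be released with sptlrpc_cli_finish_early_reply().
+ */
+#if 0 /* example only, not built */
+static void example_handle_early_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request *early_req;
+
+ if (sptlrpc_cli_unwrap_early_reply(req, &early_req) == 0) {
+ /* ... read what is needed from early_req->rq_repmsg ... */
+ sptlrpc_cli_finish_early_reply(early_req);
+ }
+}
+#endif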
+
+/**************************************************
+ * sec ID *
+ **************************************************/
+
+/*
+ * "fixed" sec (e.g. null) use sec_id < 0
+ */
+static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1);
+
+int sptlrpc_get_next_secid(void)
+{
+ return atomic_inc_return(&sptlrpc_sec_id);
+}
+EXPORT_SYMBOL(sptlrpc_get_next_secid);
+
+/**************************************************
+ * client side high-level security APIs *
+ **************************************************/
+
+static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid,
+ int grace, int force)
+{
+ struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+ LASSERT(policy->sp_cops);
+ LASSERT(policy->sp_cops->flush_ctx_cache);
+
+ return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force);
+}
+
+static void sec_cop_destroy_sec(struct ptlrpc_sec *sec)
+{
+ struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+ LASSERT_ATOMIC_ZERO(&sec->ps_refcount);
+ LASSERT_ATOMIC_ZERO(&sec->ps_nctx);
+ LASSERT(policy->sp_cops->destroy_sec);
+
+ CDEBUG(D_SEC, "%s@%p: being destroyed\n", sec->ps_policy->sp_name, sec);
+
+ policy->sp_cops->destroy_sec(sec);
+ sptlrpc_policy_put(policy);
+}
+
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec)
+{
+ sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_destroy);
+
+static void sptlrpc_sec_kill(struct ptlrpc_sec *sec)
+{
+ LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+ if (sec->ps_policy->sp_cops->kill_sec) {
+ sec->ps_policy->sp_cops->kill_sec(sec);
+
+ sec_cop_flush_ctx_cache(sec, -1, 1, 1);
+ }
+}
+
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec)
+{
+ if (sec)
+ atomic_inc(&sec->ps_refcount);
+
+ return sec;
+}
+EXPORT_SYMBOL(sptlrpc_sec_get);
+
+void sptlrpc_sec_put(struct ptlrpc_sec *sec)
+{
+ if (sec) {
+ LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+ if (atomic_dec_and_test(&sec->ps_refcount)) {
+ sptlrpc_gc_del_sec(sec);
+ sec_cop_destroy_sec(sec);
+ }
+ }
+}
+EXPORT_SYMBOL(sptlrpc_sec_put);
+
+/*
+ * the policy module is responsible for taking a reference on the import
+ */
+static
+struct ptlrpc_sec *sptlrpc_sec_create(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *svc_ctx,
+ struct sptlrpc_flavor *sf,
+ enum lustre_sec_part sp)
+{
+ struct ptlrpc_sec_policy *policy;
+ struct ptlrpc_sec *sec;
+ char str[32];
+
+ if (svc_ctx) {
+ LASSERT(imp->imp_dlm_fake == 1);
+
+ CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n",
+ imp->imp_obd->obd_type->typ_name,
+ imp->imp_obd->obd_name,
+ sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+ policy = sptlrpc_policy_get(svc_ctx->sc_policy);
+ sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
+ } else {
+ LASSERT(imp->imp_dlm_fake == 0);
+
+ CDEBUG(D_SEC, "%s %s: select security flavor %s\n",
+ imp->imp_obd->obd_type->typ_name,
+ imp->imp_obd->obd_name,
+ sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+ policy = sptlrpc_wireflavor2policy(sf->sf_rpc);
+ if (!policy) {
+ CERROR("invalid flavor 0x%x\n", sf->sf_rpc);
+ return NULL;
+ }
+ }
+
+ sec = policy->sp_cops->create_sec(imp, svc_ctx, sf);
+ if (sec) {
+ atomic_inc(&sec->ps_refcount);
+
+ sec->ps_part = sp;
+
+ if (sec->ps_gc_interval && policy->sp_cops->gc_ctx)
+ sptlrpc_gc_add_sec(sec);
+ } else {
+ sptlrpc_policy_put(policy);
+ }
+
+ return sec;
+}
+
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp)
+{
+ struct ptlrpc_sec *sec;
+
+ spin_lock(&imp->imp_lock);
+ sec = sptlrpc_sec_get(imp->imp_sec);
+ spin_unlock(&imp->imp_lock);
+
+ return sec;
+}
+EXPORT_SYMBOL(sptlrpc_import_sec_ref);
+
+static void sptlrpc_import_sec_install(struct obd_import *imp,
+ struct ptlrpc_sec *sec)
+{
+ struct ptlrpc_sec *old_sec;
+
+ LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+ spin_lock(&imp->imp_lock);
+ old_sec = imp->imp_sec;
+ imp->imp_sec = sec;
+ spin_unlock(&imp->imp_lock);
+
+ if (old_sec) {
+ sptlrpc_sec_kill(old_sec);
+
+ /* balance the ref taken by this import */
+ sptlrpc_sec_put(old_sec);
+ }
+}
+
+static inline
+int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2)
+{
+ return (memcmp(sf1, sf2, sizeof(*sf1)) == 0);
+}
+
+static inline
+void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src)
+{
+ *dst = *src;
+}
+
+static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp,
+ struct ptlrpc_sec *sec,
+ struct sptlrpc_flavor *sf)
+{
+ char str1[32], str2[32];
+
+ if (sec->ps_flvr.sf_flags != sf->sf_flags)
+ CDEBUG(D_SEC, "changing sec flags: %s -> %s\n",
+ sptlrpc_secflags2str(sec->ps_flvr.sf_flags,
+ str1, sizeof(str1)),
+ sptlrpc_secflags2str(sf->sf_flags,
+ str2, sizeof(str2)));
+
+ spin_lock(&sec->ps_lock);
+ flavor_copy(&sec->ps_flvr, sf);
+ spin_unlock(&sec->ps_lock);
+}
+
+/**
+ * Get an appropriate ptlrpc_sec for \a imp, according to the current
+ * configuration. When called, imp->imp_sec may or may not be NULL.
+ *
+ * - regular import: \a svc_ctx should be NULL and \a flvr is ignored;
+ * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request.
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *svc_ctx,
+ struct sptlrpc_flavor *flvr)
+{
+ struct ptlrpc_connection *conn;
+ struct sptlrpc_flavor sf;
+ struct ptlrpc_sec *sec, *newsec;
+ enum lustre_sec_part sp;
+ char str[24];
+ int rc = 0;
+
+ might_sleep();
+
+ if (imp == NULL)
+ return 0;
+
+ conn = imp->imp_connection;
+
+ if (svc_ctx == NULL) {
+ struct client_obd *cliobd = &imp->imp_obd->u.cli;
+ /*
+ * normal import: determine the flavor from the rule set,
+ * except for the mgc, whose flavor is predetermined.
+ */
+ if (cliobd->cl_sp_me == LUSTRE_SP_MGC)
+ sf = cliobd->cl_flvr_mgc;
+ else
+ sptlrpc_conf_choose_flavor(cliobd->cl_sp_me,
+ cliobd->cl_sp_to,
+ &cliobd->cl_target_uuid,
+ conn->c_self, &sf);
+
+ sp = imp->imp_obd->u.cli.cl_sp_me;
+ } else {
+ /* reverse import, determine flavor from incoming request */
+ sf = *flvr;
+
+ if (sf.sf_rpc != SPTLRPC_FLVR_NULL)
+ sf.sf_flags = PTLRPC_SEC_FL_REVERSE |
+ PTLRPC_SEC_FL_ROOTONLY;
+
+ sp = sptlrpc_target_sec_part(imp->imp_obd);
+ }
+
+ sec = sptlrpc_import_sec_ref(imp);
+ if (sec) {
+ char str2[24];
+
+ if (flavor_equal(&sf, &sec->ps_flvr))
+ goto out;
+
+ CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n",
+ imp->imp_obd->obd_name,
+ obd_uuid2str(&conn->c_remote_uuid),
+ sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
+ sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
+
+ if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) ==
+ SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) &&
+ SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
+ SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
+ sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
+ goto out;
+ }
+ } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) !=
+ SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) {
+ CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n",
+ imp->imp_obd->obd_name,
+ obd_uuid2str(&conn->c_remote_uuid),
+ LNET_NIDNET(conn->c_self),
+ sptlrpc_flavor2name(&sf, str, sizeof(str)));
+ }
+
+ mutex_lock(&imp->imp_sec_mutex);
+
+ newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp);
+ if (newsec) {
+ sptlrpc_import_sec_install(imp, newsec);
+ } else {
+ CERROR("import %s->%s: failed to create new sec\n",
+ imp->imp_obd->obd_name,
+ obd_uuid2str(&conn->c_remote_uuid));
+ rc = -EPERM;
+ }
+
+ mutex_unlock(&imp->imp_sec_mutex);
+out:
+ sptlrpc_sec_put(sec);
+ return rc;
+}
+
+void sptlrpc_import_sec_put(struct obd_import *imp)
+{
+ if (imp->imp_sec) {
+ sptlrpc_sec_kill(imp->imp_sec);
+
+ sptlrpc_sec_put(imp->imp_sec);
+ imp->imp_sec = NULL;
+ }
+}
+
+static void import_flush_ctx_common(struct obd_import *imp,
+ uid_t uid, int grace, int force)
+{
+ struct ptlrpc_sec *sec;
+
+ if (imp == NULL)
+ return;
+
+ sec = sptlrpc_import_sec_ref(imp);
+ if (sec == NULL)
+ return;
+
+ sec_cop_flush_ctx_cache(sec, uid, grace, force);
+ sptlrpc_sec_put(sec);
+}
+
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp)
+{
+ /* it's important to use grace mode, see the explanation in
+ * sptlrpc_req_refresh_ctx() */
+ import_flush_ctx_common(imp, 0, 1, 1);
+}
+
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp)
+{
+ import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()),
+ 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx);
+
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp)
+{
+ import_flush_ctx_common(imp, -1, 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx);
+
+/**
+ * Used by the ptlrpc client to allocate the request buffer of \a req. Upon
+ * successful return, req->rq_reqmsg points to a buffer of size \a msgsize.
+ */
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ struct ptlrpc_sec_policy *policy;
+ int rc;
+
+ LASSERT(ctx);
+ LASSERT(ctx->cc_sec);
+ LASSERT(ctx->cc_sec->ps_policy);
+ LASSERT(req->rq_reqmsg == NULL);
+ LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+ policy = ctx->cc_sec->ps_policy;
+ rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize);
+ if (!rc) {
+ LASSERT(req->rq_reqmsg);
+ LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+ /* zeroing preallocated buffer */
+ if (req->rq_pool)
+ memset(req->rq_reqmsg, 0, msgsize);
+ }
+
+ return rc;
+}
+
+/**
+ * Used by ptlrpc client to free request buffer of \a req. After this
+ * req->rq_reqmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ struct ptlrpc_sec_policy *policy;
+
+ LASSERT(ctx);
+ LASSERT(ctx->cc_sec);
+ LASSERT(ctx->cc_sec->ps_policy);
+ LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+ if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL)
+ return;
+
+ policy = ctx->cc_sec->ps_policy;
+ policy->sp_cops->free_reqbuf(ctx->cc_sec, req);
+ req->rq_reqmsg = NULL;
+}
+
+/*
+ * NOTE caller must guarantee the buffer size is enough for the enlargement
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+ int segment, int newsize)
+{
+ void *src, *dst;
+ int oldsize, oldmsg_size, movesize;
+
+ LASSERT(segment < msg->lm_bufcount);
+ LASSERT(msg->lm_buflens[segment] <= newsize);
+
+ if (msg->lm_buflens[segment] == newsize)
+ return;
+
+ /* nothing to do if we are enlarging the last segment */
+ if (segment == msg->lm_bufcount - 1) {
+ msg->lm_buflens[segment] = newsize;
+ return;
+ }
+
+ oldsize = msg->lm_buflens[segment];
+
+ src = lustre_msg_buf(msg, segment + 1, 0);
+ msg->lm_buflens[segment] = newsize;
+ dst = lustre_msg_buf(msg, segment + 1, 0);
+ msg->lm_buflens[segment] = oldsize;
+
+ /* move from segment + 1 to end segment */
+ LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2);
+ oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+ movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg);
+ LASSERT(movesize >= 0);
+
+ if (movesize)
+ memmove(dst, src, movesize);
+
+ /* note we don't clear the area where the old data lived; it's not secret */
+
+ /* finally set new segment size */
+ msg->lm_buflens[segment] = newsize;
+}
+EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace);
+
+/**
+ * Used by the ptlrpc client to enlarge \a segment of the request message
+ * pointed to by req->rq_reqmsg to size \a newsize; all previously filled-in
+ * data will be preserved after the enlargement. This must be called after
+ * the original request buffer has been allocated.
+ *
+ * \note After this is called, rq_reqmsg and rq_reqlen might have changed,
+ * so the caller should refresh its local pointers if needed.
+ */
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+ int segment, int newsize)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ struct ptlrpc_sec_cops *cops;
+ struct lustre_msg *msg = req->rq_reqmsg;
+
+ LASSERT(ctx);
+ LASSERT(msg);
+ LASSERT(msg->lm_bufcount > segment);
+ LASSERT(msg->lm_buflens[segment] <= newsize);
+
+ if (msg->lm_buflens[segment] == newsize)
+ return 0;
+
+ cops = ctx->cc_sec->ps_policy->sp_cops;
+ LASSERT(cops->enlarge_reqbuf);
+ return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize);
+}
+EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf);
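+
+/*
+ * Illustrative sketch, not part of the original file: enlarging one
+ * segment of an already-packed request. Since rq_reqmsg may move, any
+ * cached pointer into the message must be re-fetched afterwards. The
+ * segment index used below is invented for illustration.
+ */
+#if 0 /* example only, not built */
+static int example_enlarge_segment(struct ptlrpc_request *req, int newsize)
+{
+ void *buf;
+ int rc;
+
+ rc = sptlrpc_cli_enlarge_reqbuf(req, 2, newsize);
+ if (rc)
+ return rc;
+
+ /* rq_reqmsg might have been reallocated, re-fetch the segment */
+ buf = lustre_msg_buf(req->rq_reqmsg, 2, newsize);
+ return buf ? 0 : -EPROTO;
+}
+#endif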
+
+/**
+ * Used by ptlrpc client to allocate reply buffer of \a req.
+ *
+ * \note After this, req->rq_repmsg is still not accessible.
+ */
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ struct ptlrpc_sec_policy *policy;
+
+ LASSERT(ctx);
+ LASSERT(ctx->cc_sec);
+ LASSERT(ctx->cc_sec->ps_policy);
+
+ if (req->rq_repbuf)
+ return 0;
+
+ policy = ctx->cc_sec->ps_policy;
+ return policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize);
+}
+
+/**
+ * Used by ptlrpc client to free reply buffer of \a req. After this
+ * req->rq_repmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+ struct ptlrpc_sec_policy *policy;
+
+ LASSERT(ctx);
+ LASSERT(ctx->cc_sec);
+ LASSERT(ctx->cc_sec->ps_policy);
+ LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+ if (req->rq_repbuf == NULL)
+ return;
+ LASSERT(req->rq_repbuf_len);
+
+ policy = ctx->cc_sec->ps_policy;
+ policy->sp_cops->free_repbuf(ctx->cc_sec, req);
+ req->rq_repmsg = NULL;
+}
+
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+ struct ptlrpc_cli_ctx *ctx)
+{
+ struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy;
+
+ if (!policy->sp_cops->install_rctx)
+ return 0;
+ return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx);
+}
+
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *ctx)
+{
+ struct ptlrpc_sec_policy *policy = ctx->sc_policy;
+
+ if (!policy->sp_sops->install_rctx)
+ return 0;
+ return policy->sp_sops->install_rctx(imp, ctx);
+}
+
+/****************************************
+ * server side security *
+ ****************************************/
+
+static int flavor_allowed(struct sptlrpc_flavor *exp,
+ struct ptlrpc_request *req)
+{
+ struct sptlrpc_flavor *flvr = &req->rq_flvr;
+
+ if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc)
+ return 1;
+
+ if ((req->rq_ctx_init || req->rq_ctx_fini) &&
+ SPTLRPC_FLVR_POLICY(exp->sf_rpc) ==
+ SPTLRPC_FLVR_POLICY(flvr->sf_rpc) &&
+ SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc))
+ return 1;
+
+ return 0;
+}
+
+#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10)
+
+/**
+ * Given an export \a exp, check whether the flavor of the incoming \a req
+ * is allowed by the export \a exp. The main logic is about handling
+ * changing configurations. Returning 0 means success.
+ */
+int sptlrpc_target_export_check(struct obd_export *exp,
+ struct ptlrpc_request *req)
+{
+ struct sptlrpc_flavor flavor;
+
+ if (exp == NULL)
+ return 0;
+
+ /* a client-side export has no imp_reverse, skip it.
+ * FIXME maybe we should check the flavor here as well??? */
+ if (exp->exp_imp_reverse == NULL)
+ return 0;
+
+ /* don't care about ctx fini rpc */
+ if (req->rq_ctx_fini)
+ return 0;
+
+ spin_lock(&exp->exp_lock);
+
+ /* if the flavor just changed (exp->exp_flvr_changed != 0), we wait for
+ * the first req with the new flavor, then treat it as the current
+ * flavor and adapt the reverse sec according to it.
+ * note the first rpc with the new flavor might not be with a root ctx,
+ * in which case we delay the sec_adapt by leaving exp_flvr_adapt == 1. */
+ if (unlikely(exp->exp_flvr_changed) &&
+ flavor_allowed(&exp->exp_flvr_old[1], req)) {
+ /* make the new flavor the "current" one, and mark the old
+ * ones as about-to-expire */
+ CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp,
+ exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc);
+ flavor = exp->exp_flvr_old[1];
+ exp->exp_flvr_old[1] = exp->exp_flvr_old[0];
+ exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0];
+ exp->exp_flvr_old[0] = exp->exp_flvr;
+ exp->exp_flvr_expire[0] = get_seconds() +
+ EXP_FLVR_UPDATE_EXPIRE;
+ exp->exp_flvr = flavor;
+
+ /* flavor change finished */
+ exp->exp_flvr_changed = 0;
+ LASSERT(exp->exp_flvr_adapt == 1);
+
+ /* if it's gss, we are only interested in root ctx init */
+ if (req->rq_auth_gss &&
+ !(req->rq_ctx_init &&
+ (req->rq_auth_usr_root || req->rq_auth_usr_mdt ||
+ req->rq_auth_usr_ost))) {
+ spin_unlock(&exp->exp_lock);
+ CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n",
+ req->rq_auth_gss, req->rq_ctx_init,
+ req->rq_auth_usr_root, req->rq_auth_usr_mdt,
+ req->rq_auth_usr_ost);
+ return 0;
+ }
+
+ exp->exp_flvr_adapt = 0;
+ spin_unlock(&exp->exp_lock);
+
+ return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+ req->rq_svc_ctx, &flavor);
+ }
+
+ /* if it equals the current flavor, we accept it, but we still need
+ * to deal with the reverse sec/ctx */
+ if (likely(flavor_allowed(&exp->exp_flvr, req))) {
+ /* most cases should return here; we are only interested in
+ * gss root ctx init */
+ if (!req->rq_auth_gss || !req->rq_ctx_init ||
+ (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+ !req->rq_auth_usr_ost)) {
+ spin_unlock(&exp->exp_lock);
+ return 0;
+ }
+
+ /* if the flavor just changed, we should not proceed; just leave
+ * it, the current flavor will be discovered and replaced
+ * shortly, and let _this_ rpc pass through */
+ if (exp->exp_flvr_changed) {
+ LASSERT(exp->exp_flvr_adapt);
+ spin_unlock(&exp->exp_lock);
+ return 0;
+ }
+
+ if (exp->exp_flvr_adapt) {
+ exp->exp_flvr_adapt = 0;
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n",
+ exp, exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc);
+ flavor = exp->exp_flvr;
+ spin_unlock(&exp->exp_lock);
+
+ return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+ req->rq_svc_ctx,
+ &flavor);
+ } else {
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, install rvs ctx\n",
+ exp, exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc);
+ spin_unlock(&exp->exp_lock);
+
+ return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse,
+ req->rq_svc_ctx);
+ }
+ }
+
+ if (exp->exp_flvr_expire[0]) {
+ if (exp->exp_flvr_expire[0] >= get_seconds()) {
+ if (flavor_allowed(&exp->exp_flvr_old[0], req)) {
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the middle one (" CFS_DURATION_T ")\n", exp,
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc,
+ exp->exp_flvr_expire[0] -
+ get_seconds());
+ spin_unlock(&exp->exp_lock);
+ return 0;
+ }
+ } else {
+ CDEBUG(D_SEC, "mark middle expired\n");
+ exp->exp_flvr_expire[0] = 0;
+ }
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp,
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+ req->rq_flvr.sf_rpc);
+ }
+
+ /* now it doesn't match the current flavor; the only chance we can
+ * accept it is if it matches an old flavor which has not expired. */
+ if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) {
+ if (exp->exp_flvr_expire[1] >= get_seconds()) {
+ if (flavor_allowed(&exp->exp_flvr_old[1], req)) {
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (" CFS_DURATION_T ")\n",
+ exp,
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc,
+ exp->exp_flvr_expire[1] -
+ get_seconds());
+ spin_unlock(&exp->exp_lock);
+ return 0;
+ }
+ } else {
+ CDEBUG(D_SEC, "mark oldest expired\n");
+ exp->exp_flvr_expire[1] = 0;
+ }
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n",
+ exp, exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+ req->rq_flvr.sf_rpc);
+ } else {
+ CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n",
+ exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc);
+ }
+
+ spin_unlock(&exp->exp_lock);
+
+ CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n",
+ exp, exp->exp_obd->obd_name,
+ req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini,
+ req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost,
+ req->rq_flvr.sf_rpc,
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[0].sf_rpc,
+ exp->exp_flvr_expire[0] ?
+ (unsigned long) (exp->exp_flvr_expire[0] -
+ get_seconds()) : 0,
+ exp->exp_flvr_old[1].sf_rpc,
+ exp->exp_flvr_expire[1] ?
+ (unsigned long) (exp->exp_flvr_expire[1] -
+ get_seconds()) : 0);
+ return -EACCES;
+}
+EXPORT_SYMBOL(sptlrpc_target_export_check);
+
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+ struct sptlrpc_rule_set *rset)
+{
+ struct obd_export *exp;
+ struct sptlrpc_flavor new_flvr;
+
+ LASSERT(obd);
+
+ spin_lock(&obd->obd_dev_lock);
+
+ list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+ if (exp->exp_connection == NULL)
+ continue;
+
+ /* note if this export's flavor had just been updated
+ * (exp_flvr_changed == 1), this will override the
+ * previous update. */
+ spin_lock(&exp->exp_lock);
+ sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer,
+ exp->exp_connection->c_peer.nid,
+ &new_flvr);
+ if (exp->exp_flvr_changed ||
+ !flavor_equal(&new_flvr, &exp->exp_flvr)) {
+ exp->exp_flvr_old[1] = new_flvr;
+ exp->exp_flvr_expire[1] = 0;
+ exp->exp_flvr_changed = 1;
+ exp->exp_flvr_adapt = 1;
+
+ CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n",
+ exp, sptlrpc_part2name(exp->exp_sp_peer),
+ exp->exp_flvr.sf_rpc,
+ exp->exp_flvr_old[1].sf_rpc);
+ }
+ spin_unlock(&exp->exp_lock);
+ }
+
+ spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor);
+
+static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc)
+{
+ /* peer's claim is unreliable unless gss is being used */
+ if (!req->rq_auth_gss || svc_rc == SECSVC_DROP)
+ return svc_rc;
+
+ switch (req->rq_sp_from) {
+ case LUSTRE_SP_CLI:
+ if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) {
+ DEBUG_REQ(D_ERROR, req, "faked source CLI");
+ svc_rc = SECSVC_DROP;
+ }
+ break;
+ case LUSTRE_SP_MDT:
+ if (!req->rq_auth_usr_mdt) {
+ DEBUG_REQ(D_ERROR, req, "faked source MDT");
+ svc_rc = SECSVC_DROP;
+ }
+ break;
+ case LUSTRE_SP_OST:
+ if (!req->rq_auth_usr_ost) {
+ DEBUG_REQ(D_ERROR, req, "faked source OST");
+ svc_rc = SECSVC_DROP;
+ }
+ break;
+ case LUSTRE_SP_MGS:
+ case LUSTRE_SP_MGC:
+ if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+ !req->rq_auth_usr_ost) {
+ DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS");
+ svc_rc = SECSVC_DROP;
+ }
+ break;
+ case LUSTRE_SP_ANY:
+ default:
+ DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from);
+ svc_rc = SECSVC_DROP;
+ }
+
+ return svc_rc;
+}
+
+/**
+ * Used by the ptlrpc server to perform transformation upon the request
+ * message of the incoming \a req. This must be the first thing done with an
+ * incoming request in the ptlrpc layer.
+ *
+ * \retval SECSVC_OK success, and req->rq_reqmsg points to the request
+ * message in clear text, of size req->rq_reqlen; req->rq_svc_ctx is also set.
+ * \retval SECSVC_COMPLETE success, the request has been fully processed, and
+ * reply message has been prepared.
+ * \retval SECSVC_DROP failed, this request should be dropped.
+ */
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
+{
+ struct ptlrpc_sec_policy *policy;
+ struct lustre_msg *msg = req->rq_reqbuf;
+ int rc;
+
+ LASSERT(msg);
+ LASSERT(req->rq_reqmsg == NULL);
+ LASSERT(req->rq_repmsg == NULL);
+ LASSERT(req->rq_svc_ctx == NULL);
+
+ req->rq_req_swab_mask = 0;
+
+ rc = __lustre_unpack_msg(msg, req->rq_reqdata_len);
+ switch (rc) {
+ case 1:
+ lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+ case 0:
+ break;
+ default:
+ CERROR("error unpacking request from %s x%llu\n",
+ libcfs_id2str(req->rq_peer), req->rq_xid);
+ return SECSVC_DROP;
+ }
+
+ req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr);
+ req->rq_sp_from = LUSTRE_SP_ANY;
+ req->rq_auth_uid = -1;
+ req->rq_auth_mapped_uid = -1;
+
+ policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc);
+ if (!policy) {
+ CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc);
+ return SECSVC_DROP;
+ }
+
+ LASSERT(policy->sp_sops->accept);
+ rc = policy->sp_sops->accept(req);
+ sptlrpc_policy_put(policy);
+ LASSERT(req->rq_reqmsg || rc != SECSVC_OK);
+ LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
+
+ /*
+ * if it's not the null flavor (which means the inner msg is packed
+ * and embedded), reset the swab mask for the coming inner msg unpacking.
+ */
+ if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL)
+ req->rq_req_swab_mask = 0;
+
+ /* sanity check for the request source */
+ rc = sptlrpc_svc_check_from(req, rc);
+ return rc;
+}
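+
+/*
+ * Illustrative sketch, not part of the original file: how a service entry
+ * path is expected to act on the three return codes documented above.
+ */
+#if 0 /* example only, not built */
+static int example_svc_accept(struct ptlrpc_request *req)
+{
+ switch (sptlrpc_svc_unwrap_request(req)) {
+ case SECSVC_OK:
+ /* rq_reqmsg is clear text now, continue normal processing */
+ return 0;
+ case SECSVC_COMPLETE:
+ /* reply has already been prepared, just send it back */
+ return 1;
+ case SECSVC_DROP:
+ default:
+ /* drop the request */
+ return -1;
+ }
+}
+#endif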
+
+/**
+ * Used by the ptlrpc server to allocate the reply buffer for \a req. On
+ * success, req->rq_reply_state is set, and req->rq_reply_state->rs_msg
+ * points to a buffer of \a msglen size.
+ */
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+ struct ptlrpc_sec_policy *policy;
+ struct ptlrpc_reply_state *rs;
+ int rc;
+
+ LASSERT(req->rq_svc_ctx);
+ LASSERT(req->rq_svc_ctx->sc_policy);
+
+ policy = req->rq_svc_ctx->sc_policy;
+ LASSERT(policy->sp_sops->alloc_rs);
+
+ rc = policy->sp_sops->alloc_rs(req, msglen);
+ if (unlikely(rc == -ENOMEM)) {
+ struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+ if (svcpt->scp_service->srv_max_reply_size <
+ msglen + sizeof(struct ptlrpc_reply_state)) {
+ /* Just return failure if the size is too big */
+ CERROR("size of message is too big (%zd), %d allowed",
+ msglen + sizeof(struct ptlrpc_reply_state),
+ svcpt->scp_service->srv_max_reply_size);
+ return -ENOMEM;
+ }
+
+ /* failed alloc, try emergency pool */
+ rs = lustre_get_emerg_rs(svcpt);
+ if (rs == NULL)
+ return -ENOMEM;
+
+ req->rq_reply_state = rs;
+ rc = policy->sp_sops->alloc_rs(req, msglen);
+ if (rc) {
+ lustre_put_emerg_rs(rs);
+ req->rq_reply_state = NULL;
+ }
+ }
+
+ LASSERT(rc != 0 ||
+ (req->rq_reply_state && req->rq_reply_state->rs_msg));
+
+ return rc;
+}
+
+/**
+ * Used by the ptlrpc server to perform transformation upon the reply message.
+ *
+ * \post req->rq_reply_off is set to the appropriate server-controlled reply
+ * offset.
+ * \post req->rq_repmsg and req->rq_reply_state->rs_msg become inaccessible.
+ */
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_sec_policy *policy;
+ int rc;
+
+ LASSERT(req->rq_svc_ctx);
+ LASSERT(req->rq_svc_ctx->sc_policy);
+
+ policy = req->rq_svc_ctx->sc_policy;
+ LASSERT(policy->sp_sops->authorize);
+
+ rc = policy->sp_sops->authorize(req);
+ LASSERT(rc || req->rq_reply_state->rs_repdata_len);
+
+ return rc;
+}
+
+/**
+ * Used by ptlrpc server, to free reply_state.
+ */
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+ struct ptlrpc_sec_policy *policy;
+ unsigned int prealloc;
+
+ LASSERT(rs->rs_svc_ctx);
+ LASSERT(rs->rs_svc_ctx->sc_policy);
+
+ policy = rs->rs_svc_ctx->sc_policy;
+ LASSERT(policy->sp_sops->free_rs);
+
+ prealloc = rs->rs_prealloc;
+ policy->sp_sops->free_rs(rs);
+
+ if (prealloc)
+ lustre_put_emerg_rs(rs);
+}
+
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req)
+{
+ struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+ if (ctx != NULL)
+ atomic_inc(&ctx->sc_refcount);
+}
+
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req)
+{
+ struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+ if (ctx == NULL)
+ return;
+
+ LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+ if (atomic_dec_and_test(&ctx->sc_refcount)) {
+ if (ctx->sc_policy->sp_sops->free_ctx)
+ ctx->sc_policy->sp_sops->free_ctx(ctx);
+ }
+ req->rq_svc_ctx = NULL;
+}
+
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req)
+{
+ struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+ if (ctx == NULL)
+ return;
+
+ LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+ if (ctx->sc_policy->sp_sops->invalidate_ctx)
+ ctx->sc_policy->sp_sops->invalidate_ctx(ctx);
+}
+EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate);
+
+/****************************************
+ * bulk security *
+ ****************************************/
+
+/**
+ * Perform transformation upon the bulk data pointed to by \a desc. This is
+ * called before transforming the request message.
+ */
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc)
+{
+ struct ptlrpc_cli_ctx *ctx;
+
+ LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+ if (!req->rq_pack_bulk)
+ return 0;
+
+ ctx = req->rq_cli_ctx;
+ if (ctx->cc_ops->wrap_bulk)
+ return ctx->cc_ops->wrap_bulk(ctx, req, desc);
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk);
+
+/**
+ * This is called after unwrapping the reply message.
+ * Returns the number of bytes of plain text actually received, or an
+ * error code.
+ */
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc,
+ int nob)
+{
+ struct ptlrpc_cli_ctx *ctx;
+ int rc;
+
+ LASSERT(req->rq_bulk_read && !req->rq_bulk_write);
+
+ if (!req->rq_pack_bulk)
+ return desc->bd_nob_transferred;
+
+ ctx = req->rq_cli_ctx;
+ if (ctx->cc_ops->unwrap_bulk) {
+ rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+ if (rc < 0)
+ return rc;
+ }
+ return desc->bd_nob_transferred;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read);
+
+/**
+ * This is called after unwrapping the reply message.
+ * Returns 0 on success or an error code.
+ */
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc)
+{
+ struct ptlrpc_cli_ctx *ctx;
+ int rc;
+
+ LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
+
+ if (!req->rq_pack_bulk)
+ return 0;
+
+ ctx = req->rq_cli_ctx;
+ if (ctx->cc_ops->unwrap_bulk) {
+ rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+ if (rc < 0)
+ return rc;
+ }
+
+ /*
+ * if everything is going right, nob should equal nob_transferred.
+ * in privacy mode, nob_transferred needs to be adjusted.
+ */
+ if (desc->bd_nob != desc->bd_nob_transferred) {
+ CERROR("nob %d doesn't match transferred nob %d\n",
+ desc->bd_nob, desc->bd_nob_transferred);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write);
+
+
+/****************************************
+ * user descriptor helpers *
+ ****************************************/
+
+int sptlrpc_current_user_desc_size(void)
+{
+ int ngroups;
+
+ ngroups = current_ngroups;
+
+ if (ngroups > LUSTRE_MAX_GROUPS)
+ ngroups = LUSTRE_MAX_GROUPS;
+ return sptlrpc_user_desc_size(ngroups);
+}
+EXPORT_SYMBOL(sptlrpc_current_user_desc_size);
+
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset)
+{
+ struct ptlrpc_user_desc *pud;
+
+ pud = lustre_msg_buf(msg, offset, 0);
+
+ pud->pud_uid = from_kuid(&init_user_ns, current_uid());
+ pud->pud_gid = from_kgid(&init_user_ns, current_gid());
+ pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid());
+ pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid());
+ pud->pud_cap = cfs_curproc_cap_pack();
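+ /* the remainder of the buffer holds __u32 group ids, hence the
+ * division by 4 below */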
+ pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4;
+
+ task_lock(current);
+ if (pud->pud_ngroups > current_ngroups)
+ pud->pud_ngroups = current_ngroups;
+ memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+ pud->pud_ngroups * sizeof(__u32));
+ task_unlock(current);
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_pack_user_desc);
+
+int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed)
+{
+ struct ptlrpc_user_desc *pud;
+ int i;
+
+ pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+ if (!pud)
+ return -EINVAL;
+
+ if (swabbed) {
+ __swab32s(&pud->pud_uid);
+ __swab32s(&pud->pud_gid);
+ __swab32s(&pud->pud_fsuid);
+ __swab32s(&pud->pud_fsgid);
+ __swab32s(&pud->pud_cap);
+ __swab32s(&pud->pud_ngroups);
+ }
+
+ if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) {
+ CERROR("%u groups is too large\n", pud->pud_ngroups);
+ return -EINVAL;
+ }
+
+ if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) >
+ msg->lm_buflens[offset]) {
+ CERROR("%u groups are claimed but bufsize only %u\n",
+ pud->pud_ngroups, msg->lm_buflens[offset]);
+ return -EINVAL;
+ }
+
+ if (swabbed) {
+ for (i = 0; i < pud->pud_ngroups; i++)
+ __swab32s(&pud->pud_groups[i]);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unpack_user_desc);
+
+/****************************************
+ * misc helpers *
+ ****************************************/
+
+const char *sec2target_str(struct ptlrpc_sec *sec)
+{
+ if (!sec || !sec->ps_import || !sec->ps_import->imp_obd)
+ return "*";
+ if (sec_is_reverse(sec))
+ return "c";
+ return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid);
+}
+EXPORT_SYMBOL(sec2target_str);
+
+/*
+ * return true if the bulk data is protected
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+ switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+ case SPTLRPC_BULK_SVC_INTG:
+ case SPTLRPC_BULK_SVC_PRIV:
+ return 1;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
+/****************************************
+ * crypto API helper/alloc blkciper *
+ ****************************************/
+
+/****************************************
+ * initialize/finalize *
+ ****************************************/
+
+int sptlrpc_init(void)
+{
+ int rc;
+
+ rwlock_init(&policy_lock);
+
+ rc = sptlrpc_gc_init();
+ if (rc)
+ goto out;
+
+ rc = sptlrpc_conf_init();
+ if (rc)
+ goto out_gc;
+
+ rc = sptlrpc_enc_pool_init();
+ if (rc)
+ goto out_conf;
+
+ rc = sptlrpc_null_init();
+ if (rc)
+ goto out_pool;
+
+ rc = sptlrpc_plain_init();
+ if (rc)
+ goto out_null;
+
+ rc = sptlrpc_lproc_init();
+ if (rc)
+ goto out_plain;
+
+ return 0;
+
+out_plain:
+ sptlrpc_plain_fini();
+out_null:
+ sptlrpc_null_fini();
+out_pool:
+ sptlrpc_enc_pool_fini();
+out_conf:
+ sptlrpc_conf_fini();
+out_gc:
+ sptlrpc_gc_fini();
+out:
+ return rc;
+}
+
+void sptlrpc_fini(void)
+{
+ sptlrpc_lproc_fini();
+ sptlrpc_plain_fini();
+ sptlrpc_null_fini();
+ sptlrpc_enc_pool_fini();
+ sptlrpc_conf_fini();
+ sptlrpc_gc_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
new file mode 100644
index 000000000..c05a8554d
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
@@ -0,0 +1,884 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_bulk.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/crypto.h>
+
+#include "../include/obd.h"
+#include "../include/obd_cksum.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_sec.h"
+
+#include "ptlrpc_internal.h"
+
+/****************************************
+ * bulk encryption page pools *
+ ****************************************/
+
+
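+/*
+ * each pool is a single page filled with page pointers, so with e.g.
+ * 4 KiB pages and 8-byte pointers one pool tracks 512 pages, i.e. about
+ * 2 MiB of bulk encryption pages (numbers are illustrative only).
+ */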
+#define POINTERS_PER_PAGE (PAGE_CACHE_SIZE / sizeof(void *))
+#define PAGES_PER_POOL (POINTERS_PER_PAGE)
+
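+/*
+ * the idle index runs from 0 (pools heavily used) to IDLE_IDX_MAX (pools
+ * fully idle); IDLE_IDX_WEIGHT is the weight of the previous value in the
+ * running average maintained in sptlrpc_enc_pool_get_pages().
+ */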
+#define IDLE_IDX_MAX (100)
+#define IDLE_IDX_WEIGHT (3)
+
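+/* seconds without any pool access after which the shrinker treats the
+ * pools as fully idle */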
+#define CACHE_QUIESCENT_PERIOD (20)
+
+static struct ptlrpc_enc_page_pool {
+ /*
+ * constants
+ */
+ unsigned long epp_max_pages; /* maximum pages can hold, const */
+ unsigned int epp_max_pools; /* number of pools, const */
+
+ /*
+ * wait queue in case of not enough free pages.
+ */
+ wait_queue_head_t epp_waitq; /* waiting threads */
+ unsigned int epp_waitqlen; /* wait queue length */
+ unsigned long epp_pages_short; /* # of pages wanted by in-queue users */
+ unsigned int epp_growing:1; /* during adding pages */
+
+ /*
+ * indicates how idle the pools are, from 0 to IDLE_IDX_MAX.
+ * it is updated each time pages are taken from the pools rather
+ * than on a timer, so even if the system has been idle for a
+ * while, the idle_idx may still be low if there was no activity
+ * in the pools.
+ */
+ unsigned long epp_idle_idx;
+
+ /* last shrink time due to memory pressure */
+ long epp_last_shrink;
+ long epp_last_access;
+
+ /*
+ * in-pool pages bookkeeping
+ */
+ spinlock_t epp_lock; /* protect following fields */
+ unsigned long epp_total_pages; /* total pages in pools */
+ unsigned long epp_free_pages; /* current pages available */
+
+ /*
+ * statistics
+ */
+ unsigned long epp_st_max_pages; /* # of pages ever reached */
+ unsigned int epp_st_grows; /* # of grows */
+ unsigned int epp_st_grow_fails; /* # of add pages failures */
+ unsigned int epp_st_shrinks; /* # of shrinks */
+ unsigned long epp_st_access; /* # of access */
+ unsigned long epp_st_missings; /* # of cache misses */
+ unsigned long epp_st_lowfree; /* lowest free pages reached */
+ unsigned int epp_st_max_wqlen; /* highest waitqueue length */
+ unsigned long epp_st_max_wait; /* in jiffies */
+ /*
+ * pointers to pools
+ */
+ struct page ***epp_pools;
+} page_pools;
+
+/*
+ * /proc/fs/lustre/sptlrpc/encrypt_page_pools
+ */
+int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v)
+{
+ spin_lock(&page_pools.epp_lock);
+
+ seq_printf(m,
+ "physical pages: %lu\n"
+ "pages per pool: %lu\n"
+ "max pages: %lu\n"
+ "max pools: %u\n"
+ "total pages: %lu\n"
+ "total free: %lu\n"
+ "idle index: %lu/100\n"
+ "last shrink: %lds\n"
+ "last access: %lds\n"
+ "max pages reached: %lu\n"
+ "grows: %u\n"
+ "grows failure: %u\n"
+ "shrinks: %u\n"
+ "cache access: %lu\n"
+ "cache missing: %lu\n"
+ "low free mark: %lu\n"
+ "max waitqueue depth: %u\n"
+ "max wait time: " CFS_TIME_T "/%u\n",
+ totalram_pages,
+ PAGES_PER_POOL,
+ page_pools.epp_max_pages,
+ page_pools.epp_max_pools,
+ page_pools.epp_total_pages,
+ page_pools.epp_free_pages,
+ page_pools.epp_idle_idx,
+ get_seconds() - page_pools.epp_last_shrink,
+ get_seconds() - page_pools.epp_last_access,
+ page_pools.epp_st_max_pages,
+ page_pools.epp_st_grows,
+ page_pools.epp_st_grow_fails,
+ page_pools.epp_st_shrinks,
+ page_pools.epp_st_access,
+ page_pools.epp_st_missings,
+ page_pools.epp_st_lowfree,
+ page_pools.epp_st_max_wqlen,
+ page_pools.epp_st_max_wait,
+ HZ);
+
+ spin_unlock(&page_pools.epp_lock);
+
+ return 0;
+}
+
+static void enc_pools_release_free_pages(long npages)
+{
+ int p_idx, g_idx;
+ int p_idx_max1, p_idx_max2;
+
+ LASSERT(npages > 0);
+ LASSERT(npages <= page_pools.epp_free_pages);
+ LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
+
+ /* max pool index before the release */
+ p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
+
+ page_pools.epp_free_pages -= npages;
+ page_pools.epp_total_pages -= npages;
+
+ /* max pool index after the release */
+ p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
+ ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+
+ p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+ g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+ LASSERT(page_pools.epp_pools[p_idx]);
+
+ while (npages--) {
+ LASSERT(page_pools.epp_pools[p_idx]);
+ LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+
+ __free_page(page_pools.epp_pools[p_idx][g_idx]);
+ page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+ if (++g_idx == PAGES_PER_POOL) {
+ p_idx++;
+ g_idx = 0;
+ }
+ }
+
+ /* free unused pools */
+ while (p_idx_max1 < p_idx_max2) {
+ LASSERT(page_pools.epp_pools[p_idx_max2]);
+ OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE);
+ page_pools.epp_pools[p_idx_max2] = NULL;
+ p_idx_max2--;
+ }
+}
+
+/*
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
+ */
+static unsigned long enc_pools_shrink_count(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ /*
+ * if there has been no pool access for a long time, we consider
+ * the pools fully idle. a little race here is fine.
+ */
+ if (unlikely(get_seconds() - page_pools.epp_last_access >
+ CACHE_QUIESCENT_PERIOD)) {
+ spin_lock(&page_pools.epp_lock);
+ page_pools.epp_idle_idx = IDLE_IDX_MAX;
+ spin_unlock(&page_pools.epp_lock);
+ }
+
+ LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
+ return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
+ (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
+}
+
+/*
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
+ */
+static unsigned long enc_pools_shrink_scan(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ spin_lock(&page_pools.epp_lock);
+ sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan,
+ page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES);
+ if (sc->nr_to_scan > 0) {
+ enc_pools_release_free_pages(sc->nr_to_scan);
+ CDEBUG(D_SEC, "released %ld pages, %ld left\n",
+ (long)sc->nr_to_scan, page_pools.epp_free_pages);
+
+ page_pools.epp_st_shrinks++;
+ page_pools.epp_last_shrink = get_seconds();
+ }
+ spin_unlock(&page_pools.epp_lock);
+
+ /*
+ * if there has been no pool access for a long time, we consider
+ * the pools fully idle. a little race here is fine.
+ */
+ if (unlikely(get_seconds() - page_pools.epp_last_access >
+ CACHE_QUIESCENT_PERIOD)) {
+ spin_lock(&page_pools.epp_lock);
+ page_pools.epp_idle_idx = IDLE_IDX_MAX;
+ spin_unlock(&page_pools.epp_lock);
+ }
+
+ LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
+ return sc->nr_to_scan;
+}
+
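+/*
+ * ceiling division: how many pointer pools are needed to track npages
+ * pages, e.g. 1000 pages at 512 pages per pool need 2 pools (figures
+ * are illustrative only).
+ */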
+static inline
+int npages_to_npools(unsigned long npages)
+{
+ return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL);
+}
+
+/*
+ * return how many pages were cleaned up.
+ */
+static unsigned long enc_pools_cleanup(struct page ***pools, int npools)
+{
+ unsigned long cleaned = 0;
+ int i, j;
+
+ for (i = 0; i < npools; i++) {
+ if (pools[i]) {
+ for (j = 0; j < PAGES_PER_POOL; j++) {
+ if (pools[i][j]) {
+ __free_page(pools[i][j]);
+ cleaned++;
+ }
+ }
+ OBD_FREE(pools[i], PAGE_CACHE_SIZE);
+ pools[i] = NULL;
+ }
+ }
+
+ return cleaned;
+}
+
+/*
+ * merge the @npools pools pointed to by @pools, containing @npages new
+ * pages, into the current pools.
+ *
+ * we could avoid most of the memory copies with some tricks, but we choose
+ * the simplest way to avoid complexity. this path is not called frequently.
+ */
+static void enc_pools_insert(struct page ***pools, int npools, int npages)
+{
+ int freeslot;
+ int op_idx, np_idx, og_idx, ng_idx;
+ int cur_npools, end_npools;
+
+ LASSERT(npages > 0);
+ LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages);
+ LASSERT(npages_to_npools(npages) == npools);
+ LASSERT(page_pools.epp_growing);
+
+ spin_lock(&page_pools.epp_lock);
+
+ /*
+ * (1) fill all the free slots of current pools.
+ */
+ /* free slots are those left by pages currently rented out, plus the
+ * extra slots with index >= total_pages at the tail of the last pool. */
+ freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
+ if (freeslot != 0)
+ freeslot = PAGES_PER_POOL - freeslot;
+ freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages;
+
+ op_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+ og_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+ np_idx = npools - 1;
+ ng_idx = (npages - 1) % PAGES_PER_POOL;
+
+ while (freeslot) {
+ LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL);
+ LASSERT(pools[np_idx][ng_idx] != NULL);
+
+ page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx];
+ pools[np_idx][ng_idx] = NULL;
+
+ freeslot--;
+
+ if (++og_idx == PAGES_PER_POOL) {
+ op_idx++;
+ og_idx = 0;
+ }
+ if (--ng_idx < 0) {
+ if (np_idx == 0)
+ break;
+ np_idx--;
+ ng_idx = PAGES_PER_POOL - 1;
+ }
+ }
+
+ /*
+ * (2) add pools if needed.
+ */
+ cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) /
+ PAGES_PER_POOL;
+ end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL - 1)
+ / PAGES_PER_POOL;
+ LASSERT(end_npools <= page_pools.epp_max_pools);
+
+ np_idx = 0;
+ while (cur_npools < end_npools) {
+ LASSERT(page_pools.epp_pools[cur_npools] == NULL);
+ LASSERT(np_idx < npools);
+ LASSERT(pools[np_idx] != NULL);
+
+ page_pools.epp_pools[cur_npools++] = pools[np_idx];
+ pools[np_idx++] = NULL;
+ }
+
+ page_pools.epp_total_pages += npages;
+ page_pools.epp_free_pages += npages;
+ page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+ if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+ page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
+ CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
+ page_pools.epp_total_pages);
+
+ spin_unlock(&page_pools.epp_lock);
+}
+
+static int enc_pools_add_pages(int npages)
+{
+ static DEFINE_MUTEX(add_pages_mutex);
+ struct page ***pools;
+ int npools, alloced = 0;
+ int i, j, rc = -ENOMEM;
+
+ if (npages < PTLRPC_MAX_BRW_PAGES)
+ npages = PTLRPC_MAX_BRW_PAGES;
+
+ mutex_lock(&add_pages_mutex);
+
+ if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages)
+ npages = page_pools.epp_max_pages - page_pools.epp_total_pages;
+ LASSERT(npages > 0);
+
+ page_pools.epp_st_grows++;
+
+ npools = npages_to_npools(npages);
+ OBD_ALLOC(pools, npools * sizeof(*pools));
+ if (pools == NULL)
+ goto out;
+
+ for (i = 0; i < npools; i++) {
+ OBD_ALLOC(pools[i], PAGE_CACHE_SIZE);
+ if (pools[i] == NULL)
+ goto out_pools;
+
+ for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) {
+ pools[i][j] = alloc_page(GFP_NOFS |
+ __GFP_HIGHMEM);
+ if (pools[i][j] == NULL)
+ goto out_pools;
+
+ alloced++;
+ }
+ }
+ LASSERT(alloced == npages);
+
+ enc_pools_insert(pools, npools, npages);
+ CDEBUG(D_SEC, "added %d pages into pools\n", npages);
+ rc = 0;
+
+out_pools:
+ enc_pools_cleanup(pools, npools);
+ OBD_FREE(pools, npools * sizeof(*pools));
+out:
+ if (rc) {
+ page_pools.epp_st_grow_fails++;
+ CERROR("Failed to allocate %d enc pages\n", npages);
+ }
+
+ mutex_unlock(&add_pages_mutex);
+ return rc;
+}
+
+static inline void enc_pools_wakeup(void)
+{
+ assert_spin_locked(&page_pools.epp_lock);
+ LASSERT(page_pools.epp_waitqlen >= 0);
+
+ if (unlikely(page_pools.epp_waitqlen)) {
+ LASSERT(waitqueue_active(&page_pools.epp_waitq));
+ wake_up_all(&page_pools.epp_waitq);
+ }
+}
+
+static int enc_pools_should_grow(int page_needed, long now)
+{
+ /* don't grow if someone else is growing the pools right now,
+ * or the pools has reached its full capacity
+ */
+ if (page_pools.epp_growing ||
+ page_pools.epp_total_pages == page_pools.epp_max_pages)
+ return 0;
+
+ /* if total pages is not enough, we need to grow */
+ if (page_pools.epp_total_pages < page_needed)
+ return 1;
+
+ /*
+ * we wanted to return 0 here if a shrink happened just a moment
+ * ago, but this may cause a deadlock if both client and ost
+ * live on a single node.
+ */
+#if 0
+ if (now - page_pools.epp_last_shrink < 2)
+ return 0;
+#endif
+
+ /*
+ * here we should perhaps also consider other factors such as
+ * wait queue length, idle index, etc.
+ */
+
+ /* grow the pools in any other cases */
+ return 1;
+}
+
+/*
+ * we allocate the requested pages atomically.
+ */
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
+{
+ wait_queue_t waitlink;
+ unsigned long this_idle = -1;
+ unsigned long tick = 0;
+ long now;
+ int p_idx, g_idx;
+ int i;
+
+ LASSERT(desc->bd_iov_count > 0);
+ LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
+
+ /* resent bulk, enc iov might have been allocated previously */
+ if (desc->bd_enc_iov != NULL)
+ return 0;
+
+ OBD_ALLOC(desc->bd_enc_iov,
+ desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+ if (desc->bd_enc_iov == NULL)
+ return -ENOMEM;
+
+ spin_lock(&page_pools.epp_lock);
+
+ page_pools.epp_st_access++;
+again:
+ if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
+ if (tick == 0)
+ tick = cfs_time_current();
+
+ now = get_seconds();
+
+ page_pools.epp_st_missings++;
+ page_pools.epp_pages_short += desc->bd_iov_count;
+
+ if (enc_pools_should_grow(desc->bd_iov_count, now)) {
+ page_pools.epp_growing = 1;
+
+ spin_unlock(&page_pools.epp_lock);
+ enc_pools_add_pages(page_pools.epp_pages_short / 2);
+ spin_lock(&page_pools.epp_lock);
+
+ page_pools.epp_growing = 0;
+
+ enc_pools_wakeup();
+ } else {
+ if (++page_pools.epp_waitqlen >
+ page_pools.epp_st_max_wqlen)
+ page_pools.epp_st_max_wqlen =
+ page_pools.epp_waitqlen;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ init_waitqueue_entry(&waitlink, current);
+ add_wait_queue(&page_pools.epp_waitq, &waitlink);
+
+ spin_unlock(&page_pools.epp_lock);
+ schedule();
+ remove_wait_queue(&page_pools.epp_waitq, &waitlink);
+ LASSERT(page_pools.epp_waitqlen > 0);
+ spin_lock(&page_pools.epp_lock);
+ page_pools.epp_waitqlen--;
+ }
+
+ LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
+ page_pools.epp_pages_short -= desc->bd_iov_count;
+
+ this_idle = 0;
+ goto again;
+ }
+
+ /* record max wait time */
+ if (unlikely(tick != 0)) {
+ tick = cfs_time_current() - tick;
+ if (tick > page_pools.epp_st_max_wait)
+ page_pools.epp_st_max_wait = tick;
+ }
+
+ /* proceed with rest of allocation */
+ page_pools.epp_free_pages -= desc->bd_iov_count;
+
+ p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+ g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+ for (i = 0; i < desc->bd_iov_count; i++) {
+ LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+ desc->bd_enc_iov[i].kiov_page =
+ page_pools.epp_pools[p_idx][g_idx];
+ page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+ if (++g_idx == PAGES_PER_POOL) {
+ p_idx++;
+ g_idx = 0;
+ }
+ }
+
+ if (page_pools.epp_free_pages < page_pools.epp_st_lowfree)
+ page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+ /*
+ * new idle index = (old * weight + new) / (weight + 1)
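+ * e.g. with weight 3, an old index of 60 and an instantaneous value of
+ * 20, the new index is (60*3 + 20)/4 = 50 (numbers purely illustrative).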
+ */
+ if (this_idle == -1) {
+ this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX /
+ page_pools.epp_total_pages;
+ }
+ page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT +
+ this_idle) /
+ (IDLE_IDX_WEIGHT + 1);
+
+ page_pools.epp_last_access = get_seconds();
+
+ spin_unlock(&page_pools.epp_lock);
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages);
+
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
+{
+ int p_idx, g_idx;
+ int i;
+
+ if (desc->bd_enc_iov == NULL)
+ return;
+
+ LASSERT(desc->bd_iov_count > 0);
+
+ spin_lock(&page_pools.epp_lock);
+
+ p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+ g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+ LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
+ page_pools.epp_total_pages);
+ LASSERT(page_pools.epp_pools[p_idx]);
+
+ for (i = 0; i < desc->bd_iov_count; i++) {
+ LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
+ LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
+ LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
+
+ page_pools.epp_pools[p_idx][g_idx] =
+ desc->bd_enc_iov[i].kiov_page;
+
+ if (++g_idx == PAGES_PER_POOL) {
+ p_idx++;
+ g_idx = 0;
+ }
+ }
+
+ page_pools.epp_free_pages += desc->bd_iov_count;
+
+ enc_pools_wakeup();
+
+ spin_unlock(&page_pools.epp_lock);
+
+ OBD_FREE(desc->bd_enc_iov,
+ desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+ desc->bd_enc_iov = NULL;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
+
+/*
+ * we don't do much for add_user/del_user anymore, except adding some
+ * initial pages in add_user() if the current pools are empty; the rest is
+ * handled by the pools' self-adaptation.
+ */
+int sptlrpc_enc_pool_add_user(void)
+{
+ int need_grow = 0;
+
+ spin_lock(&page_pools.epp_lock);
+ if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) {
+ page_pools.epp_growing = 1;
+ need_grow = 1;
+ }
+ spin_unlock(&page_pools.epp_lock);
+
+ if (need_grow) {
+ enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES +
+ PTLRPC_MAX_BRW_PAGES);
+
+ spin_lock(&page_pools.epp_lock);
+ page_pools.epp_growing = 0;
+ enc_pools_wakeup();
+ spin_unlock(&page_pools.epp_lock);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_add_user);
+
+int sptlrpc_enc_pool_del_user(void)
+{
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_del_user);
+
+static inline void enc_pools_alloc(void)
+{
+ LASSERT(page_pools.epp_max_pools);
+ OBD_ALLOC_LARGE(page_pools.epp_pools,
+ page_pools.epp_max_pools *
+ sizeof(*page_pools.epp_pools));
+}
+
+static inline void enc_pools_free(void)
+{
+ LASSERT(page_pools.epp_max_pools);
+ LASSERT(page_pools.epp_pools);
+
+ OBD_FREE_LARGE(page_pools.epp_pools,
+ page_pools.epp_max_pools *
+ sizeof(*page_pools.epp_pools));
+}
+
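+/*
+ * hook into the kernel memory shrinker: .count_objects estimates how many
+ * encryption pages could be released, and .scan_objects frees up to
+ * sc->nr_to_scan of them under memory pressure.
+ */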
+static struct shrinker pools_shrinker = {
+ .count_objects = enc_pools_shrink_count,
+ .scan_objects = enc_pools_shrink_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+int sptlrpc_enc_pool_init(void)
+{
+ /*
+ * maximum capacity is 1/8 of total physical memory.
+ * whether 1/8 is the right fraction is an open question.
+ */
+ page_pools.epp_max_pages = totalram_pages / 8;
+ page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages);
+
+ init_waitqueue_head(&page_pools.epp_waitq);
+ page_pools.epp_waitqlen = 0;
+ page_pools.epp_pages_short = 0;
+
+ page_pools.epp_growing = 0;
+
+ page_pools.epp_idle_idx = 0;
+ page_pools.epp_last_shrink = get_seconds();
+ page_pools.epp_last_access = get_seconds();
+
+ spin_lock_init(&page_pools.epp_lock);
+ page_pools.epp_total_pages = 0;
+ page_pools.epp_free_pages = 0;
+
+ page_pools.epp_st_max_pages = 0;
+ page_pools.epp_st_grows = 0;
+ page_pools.epp_st_grow_fails = 0;
+ page_pools.epp_st_shrinks = 0;
+ page_pools.epp_st_access = 0;
+ page_pools.epp_st_missings = 0;
+ page_pools.epp_st_lowfree = 0;
+ page_pools.epp_st_max_wqlen = 0;
+ page_pools.epp_st_max_wait = 0;
+
+ enc_pools_alloc();
+ if (page_pools.epp_pools == NULL)
+ return -ENOMEM;
+
+ register_shrinker(&pools_shrinker);
+
+ return 0;
+}
+
+void sptlrpc_enc_pool_fini(void)
+{
+ unsigned long cleaned, npools;
+
+ LASSERT(page_pools.epp_pools);
+ LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages);
+
+ unregister_shrinker(&pools_shrinker);
+
+ npools = npages_to_npools(page_pools.epp_total_pages);
+ cleaned = enc_pools_cleanup(page_pools.epp_pools, npools);
+ LASSERT(cleaned == page_pools.epp_total_pages);
+
+ enc_pools_free();
+
+ if (page_pools.epp_st_access > 0) {
+ CDEBUG(D_SEC,
+ "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait "
+ CFS_TIME_T"/%d\n",
+ page_pools.epp_st_max_pages, page_pools.epp_st_grows,
+ page_pools.epp_st_grow_fails,
+ page_pools.epp_st_shrinks, page_pools.epp_st_access,
+ page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
+ page_pools.epp_st_max_wait, HZ);
+ }
+}
+
+
+static int cfs_hash_alg_id[] = {
+ [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL,
+ [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32,
+ [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32,
+ [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5,
+ [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1,
+ [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256,
+ [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384,
+ [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512,
+};
+const char *sptlrpc_get_hash_name(__u8 hash_alg)
+{
+ return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_name);
+
+__u8 sptlrpc_get_hash_alg(const char *algname)
+{
+ return cfs_crypto_hash_alg(algname);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed)
+{
+ struct ptlrpc_bulk_sec_desc *bsd;
+ int size = msg->lm_buflens[offset];
+
+ bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+ if (bsd == NULL) {
+ CERROR("Invalid bulk sec desc: size %d\n", size);
+ return -EINVAL;
+ }
+
+ if (swabbed)
+ __swab32s(&bsd->bsd_nob);
+
+ if (unlikely(bsd->bsd_version != 0)) {
+ CERROR("Unexpected version %u\n", bsd->bsd_version);
+ return -EPROTO;
+ }
+
+ if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
+ CERROR("Invalid type %u\n", bsd->bsd_type);
+ return -EPROTO;
+ }
+
+ /* FIXME more sanity check here */
+
+ if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+ bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG &&
+ bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) {
+ CERROR("Invalid svc %u\n", bsd->bsd_svc);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(bulk_sec_desc_unpack);
+
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+ void *buf, int buflen)
+{
+ struct cfs_crypto_hash_desc *hdesc;
+ int hashsize;
+ char hashbuf[64];
+ unsigned int bufsize;
+ int i, err;
+
+ LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
+ LASSERT(buflen >= 4);
+
+ hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0);
+ if (IS_ERR(hdesc)) {
+ CERROR("Unable to initialize checksum hash %s\n",
+ cfs_crypto_hash_name(cfs_hash_alg_id[alg]));
+ return PTR_ERR(hdesc);
+ }
+
+ hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]);
+
+ for (i = 0; i < desc->bd_iov_count; i++) {
+ cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
+ desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
+ desc->bd_iov[i].kiov_len);
+ }
+ if (hashsize > buflen) {
+ bufsize = sizeof(hashbuf);
+ err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf,
+ &bufsize);
+ memcpy(buf, hashbuf, buflen);
+ } else {
+ bufsize = buflen;
+ err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf,
+ &bufsize);
+ }
+
+ if (err)
+ cfs_crypto_hash_final(hdesc, NULL, NULL);
+ return err;
+}
+EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
new file mode 100644
index 000000000..56ba9e4e5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
@@ -0,0 +1,901 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include "../include/obd.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_param.h"
+#include "../include/lustre_sec.h"
+
+#include "ptlrpc_internal.h"
+
+const char *sptlrpc_part2name(enum lustre_sec_part part)
+{
+ switch (part) {
+ case LUSTRE_SP_CLI:
+ return "cli";
+ case LUSTRE_SP_MDT:
+ return "mdt";
+ case LUSTRE_SP_OST:
+ return "ost";
+ case LUSTRE_SP_MGC:
+ return "mgc";
+ case LUSTRE_SP_MGS:
+ return "mgs";
+ case LUSTRE_SP_ANY:
+ return "any";
+ default:
+ return "err";
+ }
+}
+EXPORT_SYMBOL(sptlrpc_part2name);
+
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd)
+{
+ const char *type = obd->obd_type->typ_name;
+
+ if (!strcmp(type, LUSTRE_MDT_NAME))
+ return LUSTRE_SP_MDT;
+ if (!strcmp(type, LUSTRE_OST_NAME))
+ return LUSTRE_SP_OST;
+ if (!strcmp(type, LUSTRE_MGS_NAME))
+ return LUSTRE_SP_MGS;
+
+ CERROR("unknown target %p(%s)\n", obd, type);
+ return LUSTRE_SP_ANY;
+}
+EXPORT_SYMBOL(sptlrpc_target_sec_part);
+
+/****************************************
+ * user supplied flavor string parsing *
+ ****************************************/
+
+/*
+ * format: <base_flavor>[-<bulk_type:alg_spec>]
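+ * e.g. "null", "plain" or "plain-hash:adler32" (example strings; only the
+ * "plain" base flavor accepts a bulk spec, and the hash name must be one
+ * that sptlrpc_get_hash_alg() recognizes)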
+ */
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
+{
+ char buf[32];
+ char *bulk, *alg;
+
+ memset(flvr, 0, sizeof(*flvr));
+
+ if (str == NULL || str[0] == '\0') {
+ flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
+ return 0;
+ }
+
+ strncpy(buf, str, sizeof(buf));
+ buf[sizeof(buf) - 1] = '\0';
+
+ bulk = strchr(buf, '-');
+ if (bulk)
+ *bulk++ = '\0';
+
+ flvr->sf_rpc = sptlrpc_name2flavor_base(buf);
+ if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+ goto err_out;
+
+ /*
+ * currently only the base flavor "plain" can carry a bulk specification.
+ */
+ if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) {
+ flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+ if (bulk) {
+ /*
+ * format: plain-hash:<hash_alg>
+ */
+ alg = strchr(bulk, ':');
+ if (alg == NULL)
+ goto err_out;
+ *alg++ = '\0';
+
+ if (strcmp(bulk, "hash"))
+ goto err_out;
+
+ flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg);
+ if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+ goto err_out;
+ }
+
+ if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+ flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+ else
+ flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+ } else {
+ if (bulk)
+ goto err_out;
+ }
+
+ flvr->sf_flags = 0;
+ return 0;
+
+err_out:
+ CERROR("invalid flavor string: %s\n", str);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_parse_flavor);
+
+/****************************************
+ * configure rules *
+ ****************************************/
+
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+ memset(sf, 0, sizeof(*sf));
+
+ sf->sf_rpc = SPTLRPC_FLVR_NULL;
+ sf->sf_flags = 0;
+}
+
+static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
+{
+ rule->sr_netid = LNET_NIDNET(LNET_NID_ANY);
+ rule->sr_from = LUSTRE_SP_ANY;
+ rule->sr_to = LUSTRE_SP_ANY;
+ rule->sr_padding = 0;
+
+ get_default_flavor(&rule->sr_flvr);
+}
+
+/*
+ * format: network[.direction]=flavor
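+ * e.g. "default=plain" or "tcp0.cli2mdt=null" (illustrative values; the
+ * network name must be one that libcfs_str2net() accepts)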
+ */
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule)
+{
+ char *flavor, *dir;
+ int rc;
+
+ sptlrpc_rule_init(rule);
+
+ flavor = strchr(param, '=');
+ if (flavor == NULL) {
+ CERROR("invalid param, no '='\n");
+ return -EINVAL;
+ }
+ *flavor++ = '\0';
+
+ dir = strchr(param, '.');
+ if (dir)
+ *dir++ = '\0';
+
+ /* 1.1 network */
+ if (strcmp(param, "default")) {
+ rule->sr_netid = libcfs_str2net(param);
+ if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) {
+ CERROR("invalid network name: %s\n", param);
+ return -EINVAL;
+ }
+ }
+
+ /* 1.2 direction */
+ if (dir) {
+ if (!strcmp(dir, "mdt2ost")) {
+ rule->sr_from = LUSTRE_SP_MDT;
+ rule->sr_to = LUSTRE_SP_OST;
+ } else if (!strcmp(dir, "mdt2mdt")) {
+ rule->sr_from = LUSTRE_SP_MDT;
+ rule->sr_to = LUSTRE_SP_MDT;
+ } else if (!strcmp(dir, "cli2ost")) {
+ rule->sr_from = LUSTRE_SP_CLI;
+ rule->sr_to = LUSTRE_SP_OST;
+ } else if (!strcmp(dir, "cli2mdt")) {
+ rule->sr_from = LUSTRE_SP_CLI;
+ rule->sr_to = LUSTRE_SP_MDT;
+ } else {
+ CERROR("invalid rule dir segment: %s\n", dir);
+ return -EINVAL;
+ }
+ }
+
+ /* 2.1 flavor */
+ rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr);
+ if (rc)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_parse_rule);
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset)
+{
+ LASSERT(rset->srs_nslot ||
+ (rset->srs_nrule == 0 && rset->srs_rules == NULL));
+
+ if (rset->srs_nslot) {
+ OBD_FREE(rset->srs_rules,
+ rset->srs_nslot * sizeof(*rset->srs_rules));
+ sptlrpc_rule_set_init(rset);
+ }
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_free);
+
+/*
+ * make sure the rule set can accommodate one more rule; return 0 on
+ * success or -ENOMEM on allocation failure.
+ */
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
+{
+ struct sptlrpc_rule *rules;
+ int nslot;
+
+ might_sleep();
+
+ if (rset->srs_nrule < rset->srs_nslot)
+ return 0;
+
+ nslot = rset->srs_nslot + 8;
+
+ /* better use realloc() if available */
+ OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules));
+ if (rules == NULL)
+ return -ENOMEM;
+
+ if (rset->srs_nrule) {
+ LASSERT(rset->srs_nslot && rset->srs_rules);
+ memcpy(rules, rset->srs_rules,
+ rset->srs_nrule * sizeof(*rset->srs_rules));
+
+ OBD_FREE(rset->srs_rules,
+ rset->srs_nslot * sizeof(*rset->srs_rules));
+ }
+
+ rset->srs_rules = rules;
+ rset->srs_nslot = nslot;
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_expand);
+
+static inline int rule_spec_dir(struct sptlrpc_rule *rule)
+{
+ return (rule->sr_from != LUSTRE_SP_ANY ||
+ rule->sr_to != LUSTRE_SP_ANY);
+}
+static inline int rule_spec_net(struct sptlrpc_rule *rule)
+{
+ return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY));
+}
+static inline int rule_match_dir(struct sptlrpc_rule *r1,
+ struct sptlrpc_rule *r2)
+{
+ return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to);
+}
+static inline int rule_match_net(struct sptlrpc_rule *r1,
+ struct sptlrpc_rule *r2)
+{
+ return (r1->sr_netid == r2->sr_netid);
+}
+
+/*
+ * merge @rule into @rset.
+ * the @rset slots might be expanded.
+ */
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset,
+ struct sptlrpc_rule *rule)
+{
+ struct sptlrpc_rule *p = rset->srs_rules;
+ int spec_dir, spec_net;
+ int rc, n, match = 0;
+
+ might_sleep();
+
+ spec_net = rule_spec_net(rule);
+ spec_dir = rule_spec_dir(rule);
+
+ for (n = 0; n < rset->srs_nrule; n++) {
+ p = &rset->srs_rules[n];
+
+ /* test the network match; if it fails:
+ * - specific rule: skip other specific rules until we hit a
+ * wild rule, which means there is no more chance of a match
+ * - wild rule: skip until we reach one that is also wild
+ * and matches
+ */
+ if (!rule_match_net(p, rule)) {
+ if (spec_net) {
+ if (rule_spec_net(p))
+ continue;
+ else
+ break;
+ } else {
+ continue;
+ }
+ }
+
+ /* test dir match, same logic as net matching */
+ if (!rule_match_dir(p, rule)) {
+ if (spec_dir) {
+ if (rule_spec_dir(p))
+ continue;
+ else
+ break;
+ } else {
+ continue;
+ }
+ }
+
+ /* find a match */
+ match = 1;
+ break;
+ }
+
+ if (match) {
+ LASSERT(n >= 0 && n < rset->srs_nrule);
+
+ if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
+ /* remove this rule */
+ if (n < rset->srs_nrule - 1)
+ memmove(&rset->srs_rules[n],
+ &rset->srs_rules[n + 1],
+ (rset->srs_nrule - n - 1) *
+ sizeof(*rule));
+ rset->srs_nrule--;
+ } else {
+ /* override the rule */
+ memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+ }
+ } else {
+ LASSERT(n >= 0 && n <= rset->srs_nrule);
+
+ if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) {
+ rc = sptlrpc_rule_set_expand(rset);
+ if (rc)
+ return rc;
+
+ if (n < rset->srs_nrule)
+ memmove(&rset->srs_rules[n + 1],
+ &rset->srs_rules[n],
+ (rset->srs_nrule - n) * sizeof(*rule));
+ memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+ rset->srs_nrule++;
+ } else {
+ CDEBUG(D_CONFIG, "ignore the unmatched deletion\n");
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_merge);
+
+/**
+ * given from/to/nid, determine a matching flavor in the rule set.
+ * return 1 if a match is found, otherwise return 0.
+ */
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+ enum lustre_sec_part from,
+ enum lustre_sec_part to,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *sf)
+{
+ struct sptlrpc_rule *r;
+ int n;
+
+ for (n = 0; n < rset->srs_nrule; n++) {
+ r = &rset->srs_rules[n];
+
+ if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) &&
+ r->sr_netid != LNET_NIDNET(LNET_NID_ANY) &&
+ LNET_NIDNET(nid) != r->sr_netid)
+ continue;
+
+ if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY &&
+ from != r->sr_from)
+ continue;
+
+ if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY &&
+ to != r->sr_to)
+ continue;
+
+ *sf = r->sr_flvr;
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_choose);
+
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset)
+{
+ struct sptlrpc_rule *r;
+ int n;
+
+ for (n = 0; n < rset->srs_nrule; n++) {
+ r = &rset->srs_rules[n];
+ CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n,
+ r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc);
+ }
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_dump);
+
+/**********************************
+ * sptlrpc configuration support *
+ **********************************/
+
+struct sptlrpc_conf_tgt {
+ struct list_head sct_list;
+ char sct_name[MAX_OBD_NAME];
+ struct sptlrpc_rule_set sct_rset;
+};
+
+struct sptlrpc_conf {
+ struct list_head sc_list;
+ char sc_fsname[MTI_NAME_MAXLEN];
+ unsigned int sc_modified; /* modified during updating */
+ unsigned int sc_updated:1, /* updated copy from MGS */
+ sc_local:1; /* local copy from target */
+ struct sptlrpc_rule_set sc_rset; /* fs general rules */
+ struct list_head sc_tgts; /* target-specific rules */
+};
+
+static struct mutex sptlrpc_conf_lock;
+static LIST_HEAD(sptlrpc_confs);
+
+static inline int is_hex(char c)
+{
+ return ((c >= '0' && c <= '9') ||
+ (c >= 'a' && c <= 'f'));
+}
+
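+/*
+ * strip a trailing "-MDTxxxx"/"-OSTxxxx" index (four lowercase hex digits)
+ * to recover the filesystem name, e.g. "lustre-OST0001" -> "lustre" (name
+ * purely illustrative); strings without that pattern are copied whole.
+ */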
+static void target2fsname(const char *tgt, char *fsname, int buflen)
+{
+ const char *ptr;
+ int len;
+
+ ptr = strrchr(tgt, '-');
+ if (ptr) {
+ if ((strncmp(ptr, "-MDT", 4) != 0 &&
+ strncmp(ptr, "-OST", 4) != 0) ||
+ !is_hex(ptr[4]) || !is_hex(ptr[5]) ||
+ !is_hex(ptr[6]) || !is_hex(ptr[7]))
+ ptr = NULL;
+ }
+
+ /* if we didn't find the pattern, treat the whole string as fsname */
+ if (ptr == NULL)
+ len = strlen(tgt);
+ else
+ len = ptr - tgt;
+
+ len = min(len, buflen - 1);
+ memcpy(fsname, tgt, len);
+ fsname[len] = '\0';
+}
+
+static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf)
+{
+ struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next;
+
+ sptlrpc_rule_set_free(&conf->sc_rset);
+
+ list_for_each_entry_safe(conf_tgt, conf_tgt_next,
+ &conf->sc_tgts, sct_list) {
+ sptlrpc_rule_set_free(&conf_tgt->sct_rset);
+ list_del(&conf_tgt->sct_list);
+ OBD_FREE_PTR(conf_tgt);
+ }
+ LASSERT(list_empty(&conf->sc_tgts));
+
+ conf->sc_updated = 0;
+ conf->sc_local = 0;
+}
+
+static void sptlrpc_conf_free(struct sptlrpc_conf *conf)
+{
+ CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname);
+
+ sptlrpc_conf_free_rsets(conf);
+ list_del(&conf->sc_list);
+ OBD_FREE_PTR(conf);
+}
+
+static
+struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf,
+ const char *name,
+ int create)
+{
+ struct sptlrpc_conf_tgt *conf_tgt;
+
+ list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+ if (strcmp(conf_tgt->sct_name, name) == 0)
+ return conf_tgt;
+ }
+
+ if (!create)
+ return NULL;
+
+ OBD_ALLOC_PTR(conf_tgt);
+ if (conf_tgt) {
+ strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name));
+ sptlrpc_rule_set_init(&conf_tgt->sct_rset);
+ list_add(&conf_tgt->sct_list, &conf->sc_tgts);
+ }
+
+ return conf_tgt;
+}
+
+static
+struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname,
+ int create)
+{
+ struct sptlrpc_conf *conf;
+
+ list_for_each_entry(conf, &sptlrpc_confs, sc_list) {
+ if (strcmp(conf->sc_fsname, fsname) == 0)
+ return conf;
+ }
+
+ if (!create)
+ return NULL;
+
+ OBD_ALLOC_PTR(conf);
+ if (conf == NULL)
+ return NULL;
+
+ strcpy(conf->sc_fsname, fsname);
+ sptlrpc_rule_set_init(&conf->sc_rset);
+ INIT_LIST_HEAD(&conf->sc_tgts);
+ list_add(&conf->sc_list, &sptlrpc_confs);
+
+ CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname);
+ return conf;
+}
+
+/**
+ * caller must hold conf_lock already.
+ */
+static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
+ const char *target,
+ struct sptlrpc_rule *rule)
+{
+ struct sptlrpc_conf_tgt *conf_tgt;
+ struct sptlrpc_rule_set *rule_set;
+
+ /* fsname == target means general rules for the whole fs */
+ if (strcmp(conf->sc_fsname, target) == 0) {
+ rule_set = &conf->sc_rset;
+ } else {
+ conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1);
+ if (conf_tgt) {
+ rule_set = &conf_tgt->sct_rset;
+ } else {
+ CERROR("out of memory, can't merge rule!\n");
+ return -ENOMEM;
+ }
+ }
+
+ return sptlrpc_rule_set_merge(rule_set, rule);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, look one up
+ * by the target name in the record while holding conf_lock; otherwise the
+ * caller is assumed to hold conf_lock already.
+ */
+static int __sptlrpc_process_config(struct lustre_cfg *lcfg,
+ struct sptlrpc_conf *conf)
+{
+ char *target, *param;
+ char fsname[MTI_NAME_MAXLEN];
+ struct sptlrpc_rule rule;
+ int rc;
+
+ target = lustre_cfg_string(lcfg, 1);
+ if (target == NULL) {
+ CERROR("missing target name\n");
+ return -EINVAL;
+ }
+
+ param = lustre_cfg_string(lcfg, 2);
+ if (param == NULL) {
+ CERROR("missing parameter\n");
+ return -EINVAL;
+ }
+
+ CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
+
+ /* parse rule to make sure the format is correct */
+ if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
+ CERROR("Invalid sptlrpc parameter: %s\n", param);
+ return -EINVAL;
+ }
+ param += sizeof(PARAM_SRPC_FLVR) - 1;
+
+ rc = sptlrpc_parse_rule(param, &rule);
+ if (rc)
+ return -EINVAL;
+
+ if (conf == NULL) {
+ target2fsname(target, fsname, sizeof(fsname));
+
+ mutex_lock(&sptlrpc_conf_lock);
+ conf = sptlrpc_conf_get(fsname, 0);
+ if (conf == NULL) {
+ CERROR("can't find conf\n");
+ rc = -ENOMEM;
+ } else {
+ rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+ }
+ mutex_unlock(&sptlrpc_conf_lock);
+ } else {
+ LASSERT(mutex_is_locked(&sptlrpc_conf_lock));
+ rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+ }
+
+ if (rc == 0)
+ conf->sc_modified++;
+
+ return rc;
+}
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg)
+{
+ return __sptlrpc_process_config(lcfg, NULL);
+}
+EXPORT_SYMBOL(sptlrpc_process_config);
+
+static int logname2fsname(const char *logname, char *buf, int buflen)
+{
+ char *ptr;
+ int len;
+
+ ptr = strrchr(logname, '-');
+ if (ptr == NULL || strcmp(ptr, "-sptlrpc")) {
+ CERROR("%s is not a sptlrpc config log\n", logname);
+ return -EINVAL;
+ }
+
+ len = min((int) (ptr - logname), buflen - 1);
+
+ memcpy(buf, logname, len);
+ buf[len] = '\0';
+ return 0;
+}
+
+void sptlrpc_conf_log_update_begin(const char *logname)
+{
+ struct sptlrpc_conf *conf;
+ char fsname[16];
+
+ if (logname2fsname(logname, fsname, sizeof(fsname)))
+ return;
+
+ mutex_lock(&sptlrpc_conf_lock);
+
+ conf = sptlrpc_conf_get(fsname, 0);
+ if (conf) {
+ if (conf->sc_local) {
+ LASSERT(conf->sc_updated == 0);
+ sptlrpc_conf_free_rsets(conf);
+ }
+ conf->sc_modified = 0;
+ }
+
+ mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_begin);
+
+/**
+ * mark a config log as having been updated
+ */
+void sptlrpc_conf_log_update_end(const char *logname)
+{
+ struct sptlrpc_conf *conf;
+ char fsname[16];
+
+ if (logname2fsname(logname, fsname, sizeof(fsname)))
+ return;
+
+ mutex_lock(&sptlrpc_conf_lock);
+
+ conf = sptlrpc_conf_get(fsname, 0);
+ if (conf) {
+ /*
+ * if the original state was not "updated", make sure the
+ * modified counter is > 0 to force an update of the local copy.
+ */
+ if (conf->sc_updated == 0)
+ conf->sc_modified++;
+
+ conf->sc_updated = 1;
+ }
+
+ mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_end);
+
+void sptlrpc_conf_log_start(const char *logname)
+{
+ char fsname[16];
+
+ if (logname2fsname(logname, fsname, sizeof(fsname)))
+ return;
+
+ mutex_lock(&sptlrpc_conf_lock);
+ sptlrpc_conf_get(fsname, 1);
+ mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_start);
+
+void sptlrpc_conf_log_stop(const char *logname)
+{
+ struct sptlrpc_conf *conf;
+ char fsname[16];
+
+ if (logname2fsname(logname, fsname, sizeof(fsname)))
+ return;
+
+ mutex_lock(&sptlrpc_conf_lock);
+ conf = sptlrpc_conf_get(fsname, 0);
+ if (conf)
+ sptlrpc_conf_free(conf);
+ mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_stop);
+
+static inline void flavor_set_flags(struct sptlrpc_flavor *sf,
+ enum lustre_sec_part from,
+ enum lustre_sec_part to,
+ unsigned int fl_udesc)
+{
+ /*
+ * the null flavor doesn't need any flags set, and in fact
+ * we'd better not set any because everybody shares a single sec.
+ */
+ if (sf->sf_rpc == SPTLRPC_FLVR_NULL)
+ return;
+
+ if (from == LUSTRE_SP_MDT) {
+ /* MDT->MDT; MDT->OST */
+ sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY;
+ } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) {
+ /* CLI->OST */
+ sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK;
+ } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) {
+ /* CLI->MDT */
+ if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL)
+ sf->sf_flags |= PTLRPC_SEC_FL_UDESC;
+ }
+}
+
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+ enum lustre_sec_part to,
+ struct obd_uuid *target,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *sf)
+{
+ struct sptlrpc_conf *conf;
+ struct sptlrpc_conf_tgt *conf_tgt;
+ char name[MTI_NAME_MAXLEN];
+ int len, rc = 0;
+
+ target2fsname(target->uuid, name, sizeof(name));
+
+ mutex_lock(&sptlrpc_conf_lock);
+
+ conf = sptlrpc_conf_get(name, 0);
+ if (conf == NULL)
+ goto out;
+
+ /* convert the uuid name (expected to end with _UUID) to a target name */
+ len = strlen(target->uuid);
+ LASSERT(len > 5);
+ memcpy(name, target->uuid, len - 5);
+ name[len - 5] = '\0';
+
+ conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0);
+ if (conf_tgt) {
+ rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset,
+ from, to, nid, sf);
+ if (rc)
+ goto out;
+ }
+
+ rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf);
+out:
+ mutex_unlock(&sptlrpc_conf_lock);
+
+ if (rc == 0)
+ get_default_flavor(sf);
+
+ flavor_set_flags(sf, from, to, 1);
+}
+
+/**
+ * called by target devices to determine the expected flavor from
+ * a certain peer (from, nid).
+ */
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+ enum lustre_sec_part from,
+ lnet_nid_t nid,
+ struct sptlrpc_flavor *sf)
+{
+ if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0)
+ get_default_flavor(sf);
+}
+EXPORT_SYMBOL(sptlrpc_target_choose_flavor);
+
+#define SEC_ADAPT_DELAY (10)
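+/* seconds to wait before an import adapts to a changed sptlrpc config */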
+
+/**
+ * called by client devices to note that the sptlrpc config has changed,
+ * so that import_sec_adapt can be done later.
+ */
+void sptlrpc_conf_client_adapt(struct obd_device *obd)
+{
+ struct obd_import *imp;
+
+ LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+ strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0);
+ CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid);
+
+ /* serialize with connect/disconnect import */
+ down_read(&obd->u.cli.cl_sem);
+
+ imp = obd->u.cli.cl_import;
+ if (imp) {
+ spin_lock(&imp->imp_lock);
+ if (imp->imp_sec)
+ imp->imp_sec_expire = get_seconds() +
+ SEC_ADAPT_DELAY;
+ spin_unlock(&imp->imp_lock);
+ }
+
+ up_read(&obd->u.cli.cl_sem);
+}
+EXPORT_SYMBOL(sptlrpc_conf_client_adapt);
+
+int sptlrpc_conf_init(void)
+{
+ mutex_init(&sptlrpc_conf_lock);
+ return 0;
+}
+
+void sptlrpc_conf_fini(void)
+{
+ struct sptlrpc_conf *conf, *conf_next;
+
+ mutex_lock(&sptlrpc_conf_lock);
+ list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) {
+ sptlrpc_conf_free(conf);
+ }
+ LASSERT(list_empty(&sptlrpc_confs));
+ mutex_unlock(&sptlrpc_conf_lock);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
new file mode 100644
index 000000000..81de68edb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
@@ -0,0 +1,252 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_gc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include "../../include/linux/libcfs/libcfs.h"
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_sec.h"
+
+#include "ptlrpc_internal.h"
+
+#define SEC_GC_INTERVAL (30 * 60)
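+/* the gc thread wakes up at least once per interval (value in seconds) */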
+
+
+static struct mutex sec_gc_mutex;
+static LIST_HEAD(sec_gc_list);
+static spinlock_t sec_gc_list_lock;
+
+static LIST_HEAD(sec_gc_ctx_list);
+static spinlock_t sec_gc_ctx_list_lock;
+
+static struct ptlrpc_thread sec_gc_thread;
+static atomic_t sec_gc_wait_del = ATOMIC_INIT(0);
+
+
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec)
+{
+ LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+ LASSERT(sec->ps_gc_interval > 0);
+ LASSERT(list_empty(&sec->ps_gc_list));
+
+ sec->ps_gc_next = get_seconds() + sec->ps_gc_interval;
+
+ spin_lock(&sec_gc_list_lock);
+ list_add_tail(&sec_gc_list, &sec->ps_gc_list);
+ spin_unlock(&sec_gc_list_lock);
+
+ CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_sec);
+
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec)
+{
+ if (list_empty(&sec->ps_gc_list))
+ return;
+
+ might_sleep();
+
+ /* signal before list_del to make iteration in gc thread safe */
+ atomic_inc(&sec_gc_wait_del);
+
+ spin_lock(&sec_gc_list_lock);
+ list_del_init(&sec->ps_gc_list);
+ spin_unlock(&sec_gc_list_lock);
+
+ /* barrier */
+ mutex_lock(&sec_gc_mutex);
+ mutex_unlock(&sec_gc_mutex);
+
+ atomic_dec(&sec_gc_wait_del);
+
+ CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_del_sec);
+
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx)
+{
+ LASSERT(list_empty(&ctx->cc_gc_chain));
+
+ CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n",
+ ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+ spin_lock(&sec_gc_ctx_list_lock);
+ list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list);
+ spin_unlock(&sec_gc_ctx_list_lock);
+
+ thread_add_flags(&sec_gc_thread, SVC_SIGNAL);
+ wake_up(&sec_gc_thread.t_ctl_waitq);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_ctx);
+
+static void sec_process_ctx_list(void)
+{
+ struct ptlrpc_cli_ctx *ctx;
+
+ spin_lock(&sec_gc_ctx_list_lock);
+
+ while (!list_empty(&sec_gc_ctx_list)) {
+ ctx = list_entry(sec_gc_ctx_list.next,
+ struct ptlrpc_cli_ctx, cc_gc_chain);
+ list_del_init(&ctx->cc_gc_chain);
+ spin_unlock(&sec_gc_ctx_list_lock);
+
+ LASSERT(ctx->cc_sec);
+ LASSERT(atomic_read(&ctx->cc_refcount) == 1);
+ CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n",
+ ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+ sptlrpc_cli_ctx_put(ctx, 1);
+
+ spin_lock(&sec_gc_ctx_list_lock);
+ }
+
+ spin_unlock(&sec_gc_ctx_list_lock);
+}
+
+static void sec_do_gc(struct ptlrpc_sec *sec)
+{
+ LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+
+ if (unlikely(sec->ps_gc_next == 0)) {
+ CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n",
+ sec, sec->ps_policy->sp_name);
+ return;
+ }
+
+ CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+
+ if (cfs_time_after(sec->ps_gc_next, get_seconds()))
+ return;
+
+ sec->ps_policy->sp_cops->gc_ctx(sec);
+ sec->ps_gc_next = get_seconds() + sec->ps_gc_interval;
+}
+
+static int sec_gc_main(void *arg)
+{
+ struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg;
+ struct l_wait_info lwi;
+
+ unshare_fs_struct();
+
+ /* Record that the thread is running */
+ thread_set_flags(thread, SVC_RUNNING);
+ wake_up(&thread->t_ctl_waitq);
+
+ while (1) {
+ struct ptlrpc_sec *sec;
+
+ thread_clear_flags(thread, SVC_SIGNAL);
+ sec_process_ctx_list();
+again:
+ /* go through the sec list and do gc.
+ * FIXME here we iterate through the whole list each time, which
+ * is not optimal. we might want to use a balanced binary tree
+ * to track each sec in order of expiry time.
+ * another issue is that we wake up at a fixed interval instead
+ * of according to each sec's expiry time */
+ mutex_lock(&sec_gc_mutex);
+ list_for_each_entry(sec, &sec_gc_list, ps_gc_list) {
+ /* if someone is waiting to be deleted, let it
+ * proceed as soon as possible. */
+ if (atomic_read(&sec_gc_wait_del)) {
+ CDEBUG(D_SEC, "deletion pending, start over\n");
+ mutex_unlock(&sec_gc_mutex);
+ goto again;
+ }
+
+ sec_do_gc(sec);
+ }
+ mutex_unlock(&sec_gc_mutex);
+
+ /* check ctx list again before sleep */
+ sec_process_ctx_list();
+
+ lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL);
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_stopping(thread) ||
+ thread_is_signal(thread),
+ &lwi);
+
+ if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+ break;
+ }
+
+ thread_set_flags(thread, SVC_STOPPED);
+ wake_up(&thread->t_ctl_waitq);
+ return 0;
+}
+
+int sptlrpc_gc_init(void)
+{
+ struct l_wait_info lwi = { 0 };
+ struct task_struct *task;
+
+ mutex_init(&sec_gc_mutex);
+ spin_lock_init(&sec_gc_list_lock);
+ spin_lock_init(&sec_gc_ctx_list_lock);
+
+ /* initialize thread control */
+ memset(&sec_gc_thread, 0, sizeof(sec_gc_thread));
+ init_waitqueue_head(&sec_gc_thread.t_ctl_waitq);
+
+ task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc");
+ if (IS_ERR(task)) {
+ CERROR("can't start gc thread: %ld\n", PTR_ERR(task));
+ return PTR_ERR(task);
+ }
+
+ l_wait_event(sec_gc_thread.t_ctl_waitq,
+ thread_is_running(&sec_gc_thread), &lwi);
+ return 0;
+}
+
+void sptlrpc_gc_fini(void)
+{
+ struct l_wait_info lwi = { 0 };
+
+ thread_set_flags(&sec_gc_thread, SVC_STOPPING);
+ wake_up(&sec_gc_thread.t_ctl_waitq);
+
+ l_wait_event(sec_gc_thread.t_ctl_waitq,
+ thread_is_stopped(&sec_gc_thread), &lwi);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
new file mode 100644
index 000000000..0d08145a6
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
@@ -0,0 +1,199 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_lproc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include "../../include/linux/libcfs/libcfs.h"
+#include <linux/crypto.h>
+
+#include "../include/obd.h"
+#include "../include/obd_class.h"
+#include "../include/obd_support.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_import.h"
+#include "../include/lustre_dlm.h"
+#include "../include/lustre_sec.h"
+
+#include "ptlrpc_internal.h"
+
+
+struct proc_dir_entry *sptlrpc_proc_root = NULL;
+EXPORT_SYMBOL(sptlrpc_proc_root);
+
+static char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+ buf[0] = '\0';
+
+ if (flags & PTLRPC_SEC_FL_REVERSE)
+ strlcat(buf, "reverse,", bufsize);
+ if (flags & PTLRPC_SEC_FL_ROOTONLY)
+ strlcat(buf, "rootonly,", bufsize);
+ if (flags & PTLRPC_SEC_FL_UDESC)
+ strlcat(buf, "udesc,", bufsize);
+ if (flags & PTLRPC_SEC_FL_BULK)
+ strlcat(buf, "bulk,", bufsize);
+ if (buf[0] == '\0')
+ strlcat(buf, "-,", bufsize);
+
+ return buf;
+}
+
+static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+ struct obd_device *dev = seq->private;
+ struct client_obd *cli = &dev->u.cli;
+ struct ptlrpc_sec *sec = NULL;
+ char str[32];
+
+ LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+ strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+ strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+ if (cli->cl_import)
+ sec = sptlrpc_import_sec_ref(cli->cl_import);
+ if (sec == NULL)
+ goto out;
+
+ sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str));
+
+ seq_printf(seq, "rpc flavor: %s\n",
+ sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+ seq_printf(seq, "bulk flavor: %s\n",
+ sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+ seq_printf(seq, "flags: %s\n",
+ sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
+ seq_printf(seq, "id: %d\n", sec->ps_id);
+ seq_printf(seq, "refcount: %d\n",
+ atomic_read(&sec->ps_refcount));
+ seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx));
+ seq_printf(seq, "gc internal %ld\n", sec->ps_gc_interval);
+ seq_printf(seq, "gc next %ld\n",
+ sec->ps_gc_interval ?
+ sec->ps_gc_next - get_seconds() : 0);
+
+ sptlrpc_sec_put(sec);
+out:
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs);
+
+static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+ struct obd_device *dev = seq->private;
+ struct client_obd *cli = &dev->u.cli;
+ struct ptlrpc_sec *sec = NULL;
+
+ LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+ strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+ strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+ if (cli->cl_import)
+ sec = sptlrpc_import_sec_ref(cli->cl_import);
+ if (sec == NULL)
+ goto out;
+
+ if (sec->ps_policy->sp_cops->display)
+ sec->ps_policy->sp_cops->display(sec, seq);
+
+ sptlrpc_sec_put(sec);
+out:
+ return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs);
+
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev)
+{
+ int rc;
+
+ if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 &&
+ strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 &&
+ strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) {
+ CERROR("can't register lproc for obd type %s\n",
+ dev->obd_type->typ_name);
+ return -EINVAL;
+ }
+
+ rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444,
+ &sptlrpc_info_lprocfs_fops, dev);
+ if (rc) {
+ CERROR("create proc entry srpc_info for %s: %d\n",
+ dev->obd_name, rc);
+ return rc;
+ }
+
+ rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444,
+ &sptlrpc_ctxs_lprocfs_fops, dev);
+ if (rc) {
+ CERROR("create proc entry srpc_contexts for %s: %d\n",
+ dev->obd_name, rc);
+ return rc;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach);
+
+LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool);
+static struct lprocfs_vars sptlrpc_lprocfs_vars[] = {
+ { "encrypt_page_pools", &sptlrpc_proc_enc_pool_fops },
+ { NULL }
+};
+
+int sptlrpc_lproc_init(void)
+{
+ int rc;
+
+ LASSERT(sptlrpc_proc_root == NULL);
+
+ sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root,
+ sptlrpc_lprocfs_vars, NULL);
+ if (IS_ERR(sptlrpc_proc_root)) {
+ rc = PTR_ERR(sptlrpc_proc_root);
+ sptlrpc_proc_root = NULL;
+ return rc;
+ }
+ return 0;
+}
+
+void sptlrpc_lproc_fini(void)
+{
+ if (sptlrpc_proc_root) {
+ lprocfs_remove(&sptlrpc_proc_root);
+ sptlrpc_proc_root = NULL;
+ }
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
new file mode 100644
index 000000000..4e132435b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
@@ -0,0 +1,458 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_null.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include "../include/obd_support.h"
+#include "../include/obd_cksum.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_sec.h"
+
+static struct ptlrpc_sec_policy null_policy;
+static struct ptlrpc_sec null_sec;
+static struct ptlrpc_cli_ctx null_cli_ctx;
+static struct ptlrpc_svc_ctx null_svc_ctx;
+
+/*
+ * we can temporarily use the topmost 8 bits of lm_secflvr to identify
+ * the source sec part.
+ */
+static inline
+void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
+{
+ msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24;
+}
+
+static inline
+enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
+{
+ return (msg->lm_secflvr >> 24) & 0xFF;
+}
+
+static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+ /* should never reach here */
+ LBUG();
+ return 0;
+}
+
+static
+int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+ req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+
+ if (!req->rq_import->imp_dlm_fake) {
+ struct obd_device *obd = req->rq_import->imp_obd;
+ null_encode_sec_part(req->rq_reqbuf,
+ obd->u.cli.cl_sp_me);
+ }
+ req->rq_reqdata_len = req->rq_reqlen;
+ return 0;
+}
+
+static
+int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+ __u32 cksums, cksumc;
+
+ LASSERT(req->rq_repdata);
+
+ req->rq_repmsg = req->rq_repdata;
+ req->rq_replen = req->rq_repdata_len;
+
+ if (req->rq_early) {
+ cksums = lustre_msg_get_cksum(req->rq_repdata);
+ cksumc = lustre_msg_calc_cksum(req->rq_repmsg);
+ if (cksumc != cksums) {
+ CDEBUG(D_SEC,
+ "early reply checksum mismatch: %08x != %08x\n",
+ cksumc, cksums);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static
+struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *svc_ctx,
+ struct sptlrpc_flavor *sf)
+{
+ LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
+
+ /* the generic layer has taken a module reference for us; because we
+  * never really destroy the sec, simply release the reference here.
+  */
+ sptlrpc_policy_put(&null_policy);
+ return &null_sec;
+}
+
+static
+void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+ LASSERT(sec == &null_sec);
+}
+
+static
+struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred,
+ int create, int remove_dead)
+{
+ atomic_inc(&null_cli_ctx.cc_refcount);
+ return &null_cli_ctx;
+}
+
+static
+int null_flush_ctx_cache(struct ptlrpc_sec *sec,
+ uid_t uid,
+ int grace, int force)
+{
+ return 0;
+}
+
+static
+int null_alloc_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ if (!req->rq_reqbuf) {
+ int alloc_size = size_roundup_power2(msgsize);
+
+ LASSERT(!req->rq_pool);
+ OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size);
+ if (!req->rq_reqbuf)
+ return -ENOMEM;
+
+ req->rq_reqbuf_len = alloc_size;
+ } else {
+ LASSERT(req->rq_pool);
+ LASSERT(req->rq_reqbuf_len >= msgsize);
+ memset(req->rq_reqbuf, 0, msgsize);
+ }
+
+ req->rq_reqmsg = req->rq_reqbuf;
+ return 0;
+}
+
+static
+void null_free_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ if (!req->rq_pool) {
+ LASSERTF(req->rq_reqmsg == req->rq_reqbuf,
+ "req %p: reqmsg %p is not reqbuf %p in null sec\n",
+ req, req->rq_reqmsg, req->rq_reqbuf);
+ LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen,
+ "req %p: reqlen %d should smaller than buflen %d\n",
+ req, req->rq_reqlen, req->rq_reqbuf_len);
+
+ OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+ req->rq_reqbuf = NULL;
+ req->rq_reqbuf_len = 0;
+ }
+}
+
+static
+int null_alloc_repbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ /* add space for an early reply */
+ msgsize += lustre_msg_early_size();
+
+ msgsize = size_roundup_power2(msgsize);
+
+ OBD_ALLOC_LARGE(req->rq_repbuf, msgsize);
+ if (!req->rq_repbuf)
+ return -ENOMEM;
+
+ req->rq_repbuf_len = msgsize;
+ return 0;
+}
+
+static
+void null_free_repbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_repbuf);
+
+ OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+ req->rq_repbuf = NULL;
+ req->rq_repbuf_len = 0;
+}
+
+static
+int null_enlarge_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int segment, int newsize)
+{
+ struct lustre_msg *newbuf;
+ struct lustre_msg *oldbuf = req->rq_reqmsg;
+ int oldsize, newmsg_size, alloc_size;
+
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_reqbuf == req->rq_reqmsg);
+ LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+ LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf));
+
+ /* compute the new message size by temporarily plugging in the
+  * new segment length, then restoring the old one */
+ oldsize = req->rq_reqbuf->lm_buflens[segment];
+ req->rq_reqbuf->lm_buflens[segment] = newsize;
+ newmsg_size = lustre_packed_msg_size(oldbuf);
+ req->rq_reqbuf->lm_buflens[segment] = oldsize;
+
+ /* request from pool should always have enough buffer */
+ LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size);
+
+ if (req->rq_reqbuf_len < newmsg_size) {
+ alloc_size = size_roundup_power2(newmsg_size);
+
+ OBD_ALLOC_LARGE(newbuf, alloc_size);
+ if (newbuf == NULL)
+ return -ENOMEM;
+
+ /* Must take the import lock, so that the otherwise unprotected
+  * change of rq_reqmsg does not race with threads traversing
+  * imp_replay_list in parallel. See LU-3333.
+  * This is a band-aid at best; we really need to deal with this
+  * in the request enlarging code, before the unpacking that is
+  * already there */
+ if (req->rq_import)
+ spin_lock(&req->rq_import->imp_lock);
+ memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen);
+
+ OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+ req->rq_reqbuf = req->rq_reqmsg = newbuf;
+ req->rq_reqbuf_len = alloc_size;
+
+ if (req->rq_import)
+ spin_unlock(&req->rq_import->imp_lock);
+ }
+
+ _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+ req->rq_reqlen = newmsg_size;
+
+ return 0;
+}
+
+static struct ptlrpc_svc_ctx null_svc_ctx = {
+ .sc_refcount = ATOMIC_INIT(1),
+ .sc_policy = &null_policy,
+};
+
+static
+int null_accept(struct ptlrpc_request *req)
+{
+ LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+ SPTLRPC_POLICY_NULL);
+
+ if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) {
+ CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc);
+ return SECSVC_DROP;
+ }
+
+ req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf);
+
+ req->rq_reqmsg = req->rq_reqbuf;
+ req->rq_reqlen = req->rq_reqdata_len;
+
+ req->rq_svc_ctx = &null_svc_ctx;
+ atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+ return SECSVC_OK;
+}
+
+static
+int null_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+ struct ptlrpc_reply_state *rs;
+ int rs_size = sizeof(*rs) + msgsize;
+
+ LASSERT(msgsize % 8 == 0);
+
+ rs = req->rq_reply_state;
+
+ if (rs) {
+ /* pre-allocated */
+ LASSERT(rs->rs_size >= rs_size);
+ } else {
+ OBD_ALLOC_LARGE(rs, rs_size);
+ if (rs == NULL)
+ return -ENOMEM;
+
+ rs->rs_size = rs_size;
+ }
+
+ rs->rs_svc_ctx = req->rq_svc_ctx;
+ atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+ rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+ rs->rs_repbuf_len = rs_size - sizeof(*rs);
+ rs->rs_msg = rs->rs_repbuf;
+
+ req->rq_reply_state = rs;
+ return 0;
+}
+
+static
+void null_free_rs(struct ptlrpc_reply_state *rs)
+{
+ LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1);
+ atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+ if (!rs->rs_prealloc)
+ OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+static
+int null_authorize(struct ptlrpc_request *req)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+
+ LASSERT(rs);
+
+ rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+ rs->rs_repdata_len = req->rq_replen;
+
+ if (likely(req->rq_packed_final)) {
+ if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+ req->rq_reply_off = lustre_msg_early_size();
+ else
+ req->rq_reply_off = 0;
+ } else {
+ __u32 cksum;
+
+ cksum = lustre_msg_calc_cksum(rs->rs_repbuf);
+ lustre_msg_set_cksum(rs->rs_repbuf, cksum);
+ req->rq_reply_off = 0;
+ }
+
+ return 0;
+}
+
+static struct ptlrpc_ctx_ops null_ctx_ops = {
+ .refresh = null_ctx_refresh,
+ .sign = null_ctx_sign,
+ .verify = null_ctx_verify,
+};
+
+static struct ptlrpc_sec_cops null_sec_cops = {
+ .create_sec = null_create_sec,
+ .destroy_sec = null_destroy_sec,
+ .lookup_ctx = null_lookup_ctx,
+ .flush_ctx_cache = null_flush_ctx_cache,
+ .alloc_reqbuf = null_alloc_reqbuf,
+ .alloc_repbuf = null_alloc_repbuf,
+ .free_reqbuf = null_free_reqbuf,
+ .free_repbuf = null_free_repbuf,
+ .enlarge_reqbuf = null_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops null_sec_sops = {
+ .accept = null_accept,
+ .alloc_rs = null_alloc_rs,
+ .authorize = null_authorize,
+ .free_rs = null_free_rs,
+};
+
+static struct ptlrpc_sec_policy null_policy = {
+ .sp_owner = THIS_MODULE,
+ .sp_name = "sec.null",
+ .sp_policy = SPTLRPC_POLICY_NULL,
+ .sp_cops = &null_sec_cops,
+ .sp_sops = &null_sec_sops,
+};
+
+static void null_init_internal(void)
+{
+ static HLIST_HEAD(__list);
+
+ null_sec.ps_policy = &null_policy;
+ atomic_set(&null_sec.ps_refcount, 1); /* always busy */
+ null_sec.ps_id = -1;
+ null_sec.ps_import = NULL;
+ null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
+ null_sec.ps_flvr.sf_flags = 0;
+ null_sec.ps_part = LUSTRE_SP_ANY;
+ null_sec.ps_dying = 0;
+ spin_lock_init(&null_sec.ps_lock);
+ atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */
+ INIT_LIST_HEAD(&null_sec.ps_gc_list);
+ null_sec.ps_gc_interval = 0;
+ null_sec.ps_gc_next = 0;
+
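+ /* park the context on the local dummy hlist (__list above); it only
+  * exists so that cc_cache is a valid hlist entry for the
+  * PTLRPC_CTX_CACHED flag set below -- the list is never walked */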
+ hlist_add_head(&null_cli_ctx.cc_cache, &__list);
+ atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */
+ null_cli_ctx.cc_sec = &null_sec;
+ null_cli_ctx.cc_ops = &null_ctx_ops;
+ null_cli_ctx.cc_expire = 0;
+ null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL |
+ PTLRPC_CTX_UPTODATE;
+ null_cli_ctx.cc_vcred.vc_uid = 0;
+ spin_lock_init(&null_cli_ctx.cc_lock);
+ INIT_LIST_HEAD(&null_cli_ctx.cc_req_list);
+ INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain);
+}
+
+int sptlrpc_null_init(void)
+{
+ int rc;
+
+ null_init_internal();
+
+ rc = sptlrpc_register_policy(&null_policy);
+ if (rc)
+ CERROR("failed to register %s: %d\n", null_policy.sp_name, rc);
+
+ return rc;
+}
+
+void sptlrpc_null_fini(void)
+{
+ int rc;
+
+ rc = sptlrpc_unregister_policy(&null_policy);
+ if (rc)
+ CERROR("failed to unregister %s: %d\n",
+ null_policy.sp_name, rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
new file mode 100644
index 000000000..a79cd5301
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
@@ -0,0 +1,1013 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_plain.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include "../include/obd_support.h"
+#include "../include/obd_cksum.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_sec.h"
+
+struct plain_sec {
+ struct ptlrpc_sec pls_base;
+ rwlock_t pls_lock;
+ struct ptlrpc_cli_ctx *pls_ctx;
+};
+
+static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec)
+{
+ return container_of(sec, struct plain_sec, pls_base);
+}
+
+static struct ptlrpc_sec_policy plain_policy;
+static struct ptlrpc_ctx_ops plain_ctx_ops;
+static struct ptlrpc_svc_ctx plain_svc_ctx;
+
+static unsigned int plain_at_offset;
+
+/*
+ * for simplicity, plain-policy RPCs use a fixed layout.
+ */
+#define PLAIN_PACK_SEGMENTS (4)
+
+#define PLAIN_PACK_HDR_OFF (0)
+#define PLAIN_PACK_MSG_OFF (1)
+#define PLAIN_PACK_USER_OFF (2)
+#define PLAIN_PACK_BULK_OFF (3)
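+
+/*
+ * Sketch of the resulting message layout (every plain request/reply is a
+ * lustre_msg_v2 with PLAIN_PACK_SEGMENTS buffers; unused ones stay empty):
+ *
+ *   [0] HDR  - struct plain_header
+ *   [1] MSG  - the embedded lustre_msg payload
+ *   [2] USER - user descriptor, present only when PLAIN_FL_USER is set
+ *   [3] BULK - bulk security descriptor, present only when PLAIN_FL_BULK is set
+ */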
+
+#define PLAIN_FL_USER (0x01)
+#define PLAIN_FL_BULK (0x02)
+
+struct plain_header {
+ __u8 ph_ver; /* 0 */
+ __u8 ph_flags;
+ __u8 ph_sp; /* source */
+ __u8 ph_bulk_hash_alg; /* complete flavor desc */
+ __u8 ph_pad[4];
+};
+
+struct plain_bulk_token {
+ __u8 pbt_hash[8];
+};
+
+#define PLAIN_BSD_SIZE \
+ (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token))
+
+/****************************************
+ * bulk checksum helpers *
+ ****************************************/
+
+static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed)
+{
+ struct ptlrpc_bulk_sec_desc *bsd;
+
+ if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed))
+ return -EPROTO;
+
+ bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE);
+ if (bsd == NULL) {
+ CERROR("bulk sec desc has short size %d\n",
+ lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF));
+ return -EPROTO;
+ }
+
+ if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+ bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) {
+ CERROR("invalid bulk svc %u\n", bsd->bsd_svc);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc,
+ __u8 hash_alg,
+ struct plain_bulk_token *token)
+{
+ if (hash_alg == BULK_HASH_ALG_NULL)
+ return 0;
+
+ memset(token->pbt_hash, 0, sizeof(token->pbt_hash));
+ return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash,
+ sizeof(token->pbt_hash));
+}
+
+static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc,
+ __u8 hash_alg,
+ struct plain_bulk_token *tokenr)
+{
+ struct plain_bulk_token tokenv;
+ int rc;
+
+ if (hash_alg == BULK_HASH_ALG_NULL)
+ return 0;
+
+ memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash));
+ rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash,
+ sizeof(tokenv.pbt_hash));
+ if (rc)
+ return rc;
+
+ if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash)))
+ return -EACCES;
+ return 0;
+}
+
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+ char *ptr;
+ unsigned int off, i;
+
+ for (i = 0; i < desc->bd_iov_count; i++) {
+ if (desc->bd_iov[i].kiov_len == 0)
+ continue;
+
+ ptr = kmap(desc->bd_iov[i].kiov_page);
+ off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
+ ptr[off] ^= 0x1;
+ kunmap(desc->bd_iov[i].kiov_page);
+ return;
+ }
+}
+
+/****************************************
+ * cli_ctx apis *
+ ****************************************/
+
+static
+int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+ /* should never reach here */
+ LBUG();
+ return 0;
+}
+
+static
+int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx)
+{
+ return 0;
+}
+
+static
+int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+ struct lustre_msg *msg = req->rq_reqbuf;
+ struct plain_header *phdr;
+
+ msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+ phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+ phdr->ph_ver = 0;
+ phdr->ph_flags = 0;
+ phdr->ph_sp = ctx->cc_sec->ps_part;
+ phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+ if (req->rq_pack_udesc)
+ phdr->ph_flags |= PLAIN_FL_USER;
+ if (req->rq_pack_bulk)
+ phdr->ph_flags |= PLAIN_FL_BULK;
+
+ req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount,
+ msg->lm_buflens);
+ return 0;
+}
+
+static
+int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+ struct lustre_msg *msg = req->rq_repdata;
+ struct plain_header *phdr;
+ __u32 cksum;
+ int swabbed;
+
+ if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
+ CERROR("unexpected reply buf count %u\n", msg->lm_bufcount);
+ return -EPROTO;
+ }
+
+ swabbed = ptlrpc_rep_need_swab(req);
+
+ phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+ if (phdr == NULL) {
+ CERROR("missing plain header\n");
+ return -EPROTO;
+ }
+
+ if (phdr->ph_ver != 0) {
+ CERROR("Invalid header version\n");
+ return -EPROTO;
+ }
+
+ /* expect no user desc in reply */
+ if (phdr->ph_flags & PLAIN_FL_USER) {
+ CERROR("Unexpected udesc flag in reply\n");
+ return -EPROTO;
+ }
+
+ if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) {
+ CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg,
+ req->rq_flvr.u_bulk.hash.hash_alg);
+ return -EPROTO;
+ }
+
+ if (unlikely(req->rq_early)) {
+ unsigned int hsize = 4;
+
+ cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+ lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+ lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+ NULL, 0, (unsigned char *)&cksum, &hsize);
+ if (cksum != msg->lm_cksum) {
+ CDEBUG(D_SEC,
+ "early reply checksum mismatch: %08x != %08x\n",
+ cpu_to_le32(cksum), msg->lm_cksum);
+ return -EINVAL;
+ }
+ } else {
+ /* whether or not we sent with bulk, we expect the same
+  * in the reply, except for early replies */
+ if (!req->rq_early &&
+ !equi(req->rq_pack_bulk == 1,
+ phdr->ph_flags & PLAIN_FL_BULK)) {
+ CERROR("%s bulk checksum in reply\n",
+ req->rq_pack_bulk ? "Missing" : "Unexpected");
+ return -EPROTO;
+ }
+
+ if (phdr->ph_flags & PLAIN_FL_BULK) {
+ if (plain_unpack_bsd(msg, swabbed))
+ return -EPROTO;
+ }
+ }
+
+ req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+ req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF);
+ return 0;
+}
+
+static
+int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc)
+{
+ struct ptlrpc_bulk_sec_desc *bsd;
+ struct plain_bulk_token *token;
+ int rc;
+
+ LASSERT(req->rq_pack_bulk);
+ LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+ bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+ token = (struct plain_bulk_token *) bsd->bsd_data;
+
+ bsd->bsd_version = 0;
+ bsd->bsd_flags = 0;
+ bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+ bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+ if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+ return 0;
+
+ if (req->rq_bulk_read)
+ return 0;
+
+ rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+ token);
+ if (rc) {
+ CERROR("bulk write: failed to compute checksum: %d\n", rc);
+ } else {
+ /*
+  * for sends we only compute a wrong checksum instead of
+  * corrupting the data, so the data is still correct on a redo
+  */
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
+ req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL)
+ token->pbt_hash[0] ^= 0x1;
+ }
+
+ return rc;
+}
+
+static
+int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+ struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc)
+{
+ struct ptlrpc_bulk_sec_desc *bsdv;
+ struct plain_bulk_token *tokenv;
+ int rc;
+ int i, nob;
+
+ LASSERT(req->rq_pack_bulk);
+ LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+ LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+ bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0);
+ tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+ if (req->rq_bulk_write) {
+ if (bsdv->bsd_flags & BSD_FL_ERR)
+ return -EIO;
+ return 0;
+ }
+
+ /* fix the actual data size */
+ for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+ if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) {
+ desc->bd_iov[i].kiov_len =
+ desc->bd_nob_transferred - nob;
+ }
+ nob += desc->bd_iov[i].kiov_len;
+ }
+
+ rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+ tokenv);
+ if (rc)
+ CERROR("bulk read: client verify failed: %d\n", rc);
+
+ return rc;
+}
+
+/****************************************
+ * sec apis *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec)
+{
+ struct ptlrpc_cli_ctx *ctx, *ctx_new;
+
+ OBD_ALLOC_PTR(ctx_new);
+
+ write_lock(&plsec->pls_lock);
+
+ ctx = plsec->pls_ctx;
+ if (ctx) {
+ atomic_inc(&ctx->cc_refcount);
+
+ if (ctx_new)
+ OBD_FREE_PTR(ctx_new);
+ } else if (ctx_new) {
+ ctx = ctx_new;
+
+ atomic_set(&ctx->cc_refcount, 1); /* for cache */
+ ctx->cc_sec = &plsec->pls_base;
+ ctx->cc_ops = &plain_ctx_ops;
+ ctx->cc_expire = 0;
+ ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE;
+ ctx->cc_vcred.vc_uid = 0;
+ spin_lock_init(&ctx->cc_lock);
+ INIT_LIST_HEAD(&ctx->cc_req_list);
+ INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+ plsec->pls_ctx = ctx;
+ atomic_inc(&plsec->pls_base.ps_nctx);
+ atomic_inc(&plsec->pls_base.ps_refcount);
+
+ atomic_inc(&ctx->cc_refcount); /* for caller */
+ }
+
+ write_unlock(&plsec->pls_lock);
+
+ return ctx;
+}
+
+static
+void plain_destroy_sec(struct ptlrpc_sec *sec)
+{
+ struct plain_sec *plsec = sec2plsec(sec);
+
+ LASSERT(sec->ps_policy == &plain_policy);
+ LASSERT(sec->ps_import);
+ LASSERT(atomic_read(&sec->ps_refcount) == 0);
+ LASSERT(atomic_read(&sec->ps_nctx) == 0);
+ LASSERT(plsec->pls_ctx == NULL);
+
+ class_import_put(sec->ps_import);
+
+ OBD_FREE_PTR(plsec);
+}
+
+static
+void plain_kill_sec(struct ptlrpc_sec *sec)
+{
+ sec->ps_dying = 1;
+}
+
+static
+struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
+ struct ptlrpc_svc_ctx *svc_ctx,
+ struct sptlrpc_flavor *sf)
+{
+ struct plain_sec *plsec;
+ struct ptlrpc_sec *sec;
+ struct ptlrpc_cli_ctx *ctx;
+
+ LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
+
+ OBD_ALLOC_PTR(plsec);
+ if (plsec == NULL)
+ return NULL;
+
+ /*
+ * initialize plain_sec
+ */
+ rwlock_init(&plsec->pls_lock);
+ plsec->pls_ctx = NULL;
+
+ sec = &plsec->pls_base;
+ sec->ps_policy = &plain_policy;
+ atomic_set(&sec->ps_refcount, 0);
+ atomic_set(&sec->ps_nctx, 0);
+ sec->ps_id = sptlrpc_get_next_secid();
+ sec->ps_import = class_import_get(imp);
+ sec->ps_flvr = *sf;
+ spin_lock_init(&sec->ps_lock);
+ INIT_LIST_HEAD(&sec->ps_gc_list);
+ sec->ps_gc_interval = 0;
+ sec->ps_gc_next = 0;
+
+ /* install ctx immediately if this is a reverse sec */
+ if (svc_ctx) {
+ ctx = plain_sec_install_ctx(plsec);
+ if (ctx == NULL) {
+ plain_destroy_sec(sec);
+ return NULL;
+ }
+ sptlrpc_cli_ctx_put(ctx, 1);
+ }
+
+ return sec;
+}
+
+static
+struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred,
+ int create, int remove_dead)
+{
+ struct plain_sec *plsec = sec2plsec(sec);
+ struct ptlrpc_cli_ctx *ctx;
+
+ read_lock(&plsec->pls_lock);
+ ctx = plsec->pls_ctx;
+ if (ctx)
+ atomic_inc(&ctx->cc_refcount);
+ read_unlock(&plsec->pls_lock);
+
+ if (unlikely(ctx == NULL))
+ ctx = plain_sec_install_ctx(plsec);
+
+ return ctx;
+}
+
+static
+void plain_release_ctx(struct ptlrpc_sec *sec,
+ struct ptlrpc_cli_ctx *ctx, int sync)
+{
+ LASSERT(atomic_read(&sec->ps_refcount) > 0);
+ LASSERT(atomic_read(&sec->ps_nctx) > 0);
+ LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+ LASSERT(ctx->cc_sec == sec);
+
+ OBD_FREE_PTR(ctx);
+
+ atomic_dec(&sec->ps_nctx);
+ sptlrpc_sec_put(sec);
+}
+
+static
+int plain_flush_ctx_cache(struct ptlrpc_sec *sec,
+ uid_t uid, int grace, int force)
+{
+ struct plain_sec *plsec = sec2plsec(sec);
+ struct ptlrpc_cli_ctx *ctx;
+
+ /* do nothing unless the caller wants to flush for 'all' */
+ if (uid != -1)
+ return 0;
+
+ write_lock(&plsec->pls_lock);
+ ctx = plsec->pls_ctx;
+ plsec->pls_ctx = NULL;
+ write_unlock(&plsec->pls_lock);
+
+ if (ctx)
+ sptlrpc_cli_ctx_put(ctx, 1);
+ return 0;
+}
+
+static
+int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+ int alloc_len;
+
+ buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+ buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+ if (req->rq_pack_udesc)
+ buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size();
+
+ if (req->rq_pack_bulk) {
+ LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+ buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+ }
+
+ alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+ if (!req->rq_reqbuf) {
+ LASSERT(!req->rq_pool);
+
+ alloc_len = size_roundup_power2(alloc_len);
+ OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len);
+ if (!req->rq_reqbuf)
+ return -ENOMEM;
+
+ req->rq_reqbuf_len = alloc_len;
+ } else {
+ LASSERT(req->rq_pool);
+ LASSERT(req->rq_reqbuf_len >= alloc_len);
+ memset(req->rq_reqbuf, 0, alloc_len);
+ }
+
+ lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+ req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0);
+
+ if (req->rq_pack_udesc)
+ sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF);
+
+ return 0;
+}
+
+static
+void plain_free_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ if (!req->rq_pool) {
+ OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+ req->rq_reqbuf = NULL;
+ req->rq_reqbuf_len = 0;
+ }
+}
+
+static
+int plain_alloc_repbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+ int alloc_len;
+
+ buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+ buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+ if (req->rq_pack_bulk) {
+ LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+ buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+ }
+
+ alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+ /* add space for early reply */
+ alloc_len += plain_at_offset;
+
+ alloc_len = size_roundup_power2(alloc_len);
+
+ OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len);
+ if (!req->rq_repbuf)
+ return -ENOMEM;
+
+ req->rq_repbuf_len = alloc_len;
+ return 0;
+}
+
+static
+void plain_free_repbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+ req->rq_repbuf = NULL;
+ req->rq_repbuf_len = 0;
+}
+
+static
+int plain_enlarge_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int segment, int newsize)
+{
+ struct lustre_msg *newbuf;
+ int oldsize;
+ int newmsg_size, newbuf_size;
+
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+ LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) ==
+ req->rq_reqmsg);
+
+ /* compute new embedded msg size. */
+ oldsize = req->rq_reqmsg->lm_buflens[segment];
+ req->rq_reqmsg->lm_buflens[segment] = newsize;
+ newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount,
+ req->rq_reqmsg->lm_buflens);
+ req->rq_reqmsg->lm_buflens[segment] = oldsize;
+
+ /* compute new wrapper msg size. */
+ oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF];
+ req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size;
+ newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount,
+ req->rq_reqbuf->lm_buflens);
+ req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize;
+
+ /* request from pool should always have enough buffer */
+ LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+ if (req->rq_reqbuf_len < newbuf_size) {
+ newbuf_size = size_roundup_power2(newbuf_size);
+
+ OBD_ALLOC_LARGE(newbuf, newbuf_size);
+ if (newbuf == NULL)
+ return -ENOMEM;
+
+ /* Must take the import lock, so that the otherwise unprotected
+  * change of rq_reqmsg does not race with threads traversing
+  * imp_replay_list in parallel. See LU-3333.
+  * This is a band-aid at best; we really need to deal with this
+  * in the request enlarging code, before the unpacking that is
+  * already there */
+ if (req->rq_import)
+ spin_lock(&req->rq_import->imp_lock);
+
+ memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+ OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+ req->rq_reqbuf = newbuf;
+ req->rq_reqbuf_len = newbuf_size;
+ req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf,
+ PLAIN_PACK_MSG_OFF, 0);
+
+ if (req->rq_import)
+ spin_unlock(&req->rq_import->imp_lock);
+ }
+
+ _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF,
+ newmsg_size);
+ _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+ req->rq_reqlen = newmsg_size;
+ return 0;
+}
+
+/****************************************
+ * service apis *
+ ****************************************/
+
+static struct ptlrpc_svc_ctx plain_svc_ctx = {
+ .sc_refcount = ATOMIC_INIT(1),
+ .sc_policy = &plain_policy,
+};
+
+static
+int plain_accept(struct ptlrpc_request *req)
+{
+ struct lustre_msg *msg = req->rq_reqbuf;
+ struct plain_header *phdr;
+ int swabbed;
+
+ LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+ SPTLRPC_POLICY_PLAIN);
+
+ if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) !=
+ SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) ||
+ SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) !=
+ SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) {
+ CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
+ return SECSVC_DROP;
+ }
+
+ if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) {
+ CERROR("unexpected request buf count %u\n", msg->lm_bufcount);
+ return SECSVC_DROP;
+ }
+
+ swabbed = ptlrpc_req_need_swab(req);
+
+ phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+ if (phdr == NULL) {
+ CERROR("missing plain header\n");
+ return -EPROTO;
+ }
+
+ if (phdr->ph_ver != 0) {
+ CERROR("Invalid header version\n");
+ return -EPROTO;
+ }
+
+ if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) {
+ CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg);
+ return -EPROTO;
+ }
+
+ req->rq_sp_from = phdr->ph_sp;
+ req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg;
+
+ if (phdr->ph_flags & PLAIN_FL_USER) {
+ if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF,
+ swabbed)) {
+ CERROR("Mal-formed user descriptor\n");
+ return SECSVC_DROP;
+ }
+
+ req->rq_pack_udesc = 1;
+ req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0);
+ }
+
+ if (phdr->ph_flags & PLAIN_FL_BULK) {
+ if (plain_unpack_bsd(msg, swabbed))
+ return SECSVC_DROP;
+
+ req->rq_pack_bulk = 1;
+ }
+
+ req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+ req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF];
+
+ req->rq_svc_ctx = &plain_svc_ctx;
+ atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+ return SECSVC_OK;
+}
+
+static
+int plain_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+ struct ptlrpc_reply_state *rs;
+ __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+ int rs_size = sizeof(*rs);
+
+ LASSERT(msgsize % 8 == 0);
+
+ buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+ buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+ if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write))
+ buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+
+ rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+ rs = req->rq_reply_state;
+
+ if (rs) {
+ /* pre-allocated */
+ LASSERT(rs->rs_size >= rs_size);
+ } else {
+ OBD_ALLOC_LARGE(rs, rs_size);
+ if (rs == NULL)
+ return -ENOMEM;
+
+ rs->rs_size = rs_size;
+ }
+
+ rs->rs_svc_ctx = req->rq_svc_ctx;
+ atomic_inc(&req->rq_svc_ctx->sc_refcount);
+ rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+ rs->rs_repbuf_len = rs_size - sizeof(*rs);
+
+ lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+ rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0);
+
+ req->rq_reply_state = rs;
+ return 0;
+}
+
+static
+void plain_free_rs(struct ptlrpc_reply_state *rs)
+{
+ LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1);
+ atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+ if (!rs->rs_prealloc)
+ OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+static
+int plain_authorize(struct ptlrpc_request *req)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct lustre_msg_v2 *msg = rs->rs_repbuf;
+ struct plain_header *phdr;
+ int len;
+
+ LASSERT(rs);
+ LASSERT(msg);
+
+ if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF])
+ len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF,
+ req->rq_replen, 1);
+ else
+ len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+ msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+ phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+ phdr->ph_ver = 0;
+ phdr->ph_flags = 0;
+ phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+ if (req->rq_pack_bulk)
+ phdr->ph_flags |= PLAIN_FL_BULK;
+
+ rs->rs_repdata_len = len;
+
+ if (likely(req->rq_packed_final)) {
+ if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+ req->rq_reply_off = plain_at_offset;
+ else
+ req->rq_reply_off = 0;
+ } else {
+ unsigned int hsize = 4;
+
+ cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+ lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+ lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+ NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize);
+ req->rq_reply_off = 0;
+ }
+
+ return 0;
+}
+
+static
+int plain_svc_unwrap_bulk(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+ struct plain_bulk_token *tokenr;
+ int rc;
+
+ LASSERT(req->rq_bulk_write);
+ LASSERT(req->rq_pack_bulk);
+
+ bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+ tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+ bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+
+ bsdv->bsd_version = 0;
+ bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+ bsdv->bsd_svc = bsdr->bsd_svc;
+ bsdv->bsd_flags = 0;
+
+ if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+ return 0;
+
+ rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+ tokenr);
+ if (rc) {
+ bsdv->bsd_flags |= BSD_FL_ERR;
+ CERROR("bulk write: server verify failed: %d\n", rc);
+ }
+
+ return rc;
+}
+
+static
+int plain_svc_wrap_bulk(struct ptlrpc_request *req,
+ struct ptlrpc_bulk_desc *desc)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+ struct plain_bulk_token *tokenv;
+ int rc;
+
+ LASSERT(req->rq_bulk_read);
+ LASSERT(req->rq_pack_bulk);
+
+ bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+ bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+ tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+ bsdv->bsd_version = 0;
+ bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+ bsdv->bsd_svc = bsdr->bsd_svc;
+ bsdv->bsd_flags = 0;
+
+ if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+ return 0;
+
+ rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+ tokenv);
+ if (rc) {
+ CERROR("bulk read: server failed to compute checksum: %d\n",
+ rc);
+ } else {
+ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
+ corrupt_bulk_data(desc);
+ }
+
+ return rc;
+}
+
+static struct ptlrpc_ctx_ops plain_ctx_ops = {
+ .refresh = plain_ctx_refresh,
+ .validate = plain_ctx_validate,
+ .sign = plain_ctx_sign,
+ .verify = plain_ctx_verify,
+ .wrap_bulk = plain_cli_wrap_bulk,
+ .unwrap_bulk = plain_cli_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops plain_sec_cops = {
+ .create_sec = plain_create_sec,
+ .destroy_sec = plain_destroy_sec,
+ .kill_sec = plain_kill_sec,
+ .lookup_ctx = plain_lookup_ctx,
+ .release_ctx = plain_release_ctx,
+ .flush_ctx_cache = plain_flush_ctx_cache,
+ .alloc_reqbuf = plain_alloc_reqbuf,
+ .free_reqbuf = plain_free_reqbuf,
+ .alloc_repbuf = plain_alloc_repbuf,
+ .free_repbuf = plain_free_repbuf,
+ .enlarge_reqbuf = plain_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops plain_sec_sops = {
+ .accept = plain_accept,
+ .alloc_rs = plain_alloc_rs,
+ .authorize = plain_authorize,
+ .free_rs = plain_free_rs,
+ .unwrap_bulk = plain_svc_unwrap_bulk,
+ .wrap_bulk = plain_svc_wrap_bulk,
+};
+
+static struct ptlrpc_sec_policy plain_policy = {
+ .sp_owner = THIS_MODULE,
+ .sp_name = "plain",
+ .sp_policy = SPTLRPC_POLICY_PLAIN,
+ .sp_cops = &plain_sec_cops,
+ .sp_sops = &plain_sec_sops,
+};
+
+int sptlrpc_plain_init(void)
+{
+ __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+ int rc;
+
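+ /* precompute the offset at which the final reply is placed when an
+  * early (adaptive timeout) reply may precede it; see
+  * plain_alloc_repbuf() and plain_authorize() */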
+ buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size();
+ plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+ rc = sptlrpc_register_policy(&plain_policy);
+ if (rc)
+ CERROR("failed to register: %d\n", rc);
+
+ return rc;
+}
+
+void sptlrpc_plain_fini(void)
+{
+ int rc;
+
+ rc = sptlrpc_unregister_policy(&plain_policy);
+ if (rc)
+ CERROR("cannot unregister: %d\n", rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
new file mode 100644
index 000000000..8e6142151
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -0,0 +1,3105 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lu_object.h"
+#include "../../include/linux/lnet/types.h"
+#include "ptlrpc_internal.h"
+
+/* The following are visible and mutable through /sys/module/ptlrpc */
+int test_req_buffer_pressure = 0;
+module_param(test_req_buffer_pressure, int, 0444);
+MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools");
+module_param(at_min, int, 0644);
+MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)");
+module_param(at_max, int, 0644);
+MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)");
+module_param(at_history, int, 0644);
+MODULE_PARM_DESC(at_history,
+ "Adaptive timeouts remember the slowest event that took place within this period (sec)");
+module_param(at_early_margin, int, 0644);
+MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
+module_param(at_extra, int, 0644);
+MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply");
+
+
+/* forward ref */
+static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
+static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);
+
+/** Holds a list of all PTLRPC services */
+LIST_HEAD(ptlrpc_all_services);
+/** Used to protect the \e ptlrpc_all_services list */
+struct mutex ptlrpc_all_services_mutex;
+
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ struct ptlrpc_request_buffer_desc *rqbd;
+
+ OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt);
+ if (rqbd == NULL)
+ return NULL;
+
+ rqbd->rqbd_svcpt = svcpt;
+ rqbd->rqbd_refcount = 0;
+ rqbd->rqbd_cbid.cbid_fn = request_in_callback;
+ rqbd->rqbd_cbid.cbid_arg = rqbd;
+ INIT_LIST_HEAD(&rqbd->rqbd_reqs);
+ OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable,
+ svcpt->scp_cpt, svc->srv_buf_size);
+ if (rqbd->rqbd_buffer == NULL) {
+ OBD_FREE_PTR(rqbd);
+ return NULL;
+ }
+
+ spin_lock(&svcpt->scp_lock);
+ list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+ svcpt->scp_nrqbds_total++;
+ spin_unlock(&svcpt->scp_lock);
+
+ return rqbd;
+}
+
+void
+ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+ struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+
+ LASSERT(rqbd->rqbd_refcount == 0);
+ LASSERT(list_empty(&rqbd->rqbd_reqs));
+
+ spin_lock(&svcpt->scp_lock);
+ list_del(&rqbd->rqbd_list);
+ svcpt->scp_nrqbds_total--;
+ spin_unlock(&svcpt->scp_lock);
+
+ OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size);
+ OBD_FREE_PTR(rqbd);
+}
+
+int
+ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
+{
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ struct ptlrpc_request_buffer_desc *rqbd;
+ int rc = 0;
+ int i;
+
+ if (svcpt->scp_rqbd_allocating)
+ goto try_post;
+
+ spin_lock(&svcpt->scp_lock);
+ /* check again with lock */
+ if (svcpt->scp_rqbd_allocating) {
+ /* NB: we might allow more than one thread in the future */
+ LASSERT(svcpt->scp_rqbd_allocating == 1);
+ spin_unlock(&svcpt->scp_lock);
+ goto try_post;
+ }
+
+ svcpt->scp_rqbd_allocating++;
+ spin_unlock(&svcpt->scp_lock);
+
+
+ for (i = 0; i < svc->srv_nbuf_per_group; i++) {
+ /* NB: another thread might have recycled enough rqbds; we
+  * need to make sure we don't over-allocate, see LU-1212. */
+ if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
+ break;
+
+ rqbd = ptlrpc_alloc_rqbd(svcpt);
+
+ if (rqbd == NULL) {
+ CERROR("%s: Can't allocate request buffer\n",
+ svc->srv_name);
+ rc = -ENOMEM;
+ break;
+ }
+ }
+
+ spin_lock(&svcpt->scp_lock);
+
+ LASSERT(svcpt->scp_rqbd_allocating == 1);
+ svcpt->scp_rqbd_allocating--;
+
+ spin_unlock(&svcpt->scp_lock);
+
+ CDEBUG(D_RPCTRACE,
+ "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
+ svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
+ svcpt->scp_nrqbds_total, rc);
+
+ try_post:
+ if (post && rc == 0)
+ rc = ptlrpc_server_post_idle_rqbds(svcpt);
+
+ return rc;
+}
+
+/**
+ * Part of Rep-Ack logic.
+ * Puts a lock and its mode into the reply state associated with the request.
+ */
+void
+ptlrpc_save_lock(struct ptlrpc_request *req,
+ struct lustre_handle *lock, int mode, int no_ack)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ int idx;
+
+ LASSERT(rs != NULL);
+ LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
+
+ if (req->rq_export->exp_disconnected) {
+ ldlm_lock_decref(lock, mode);
+ } else {
+ idx = rs->rs_nlocks++;
+ rs->rs_locks[idx] = *lock;
+ rs->rs_modes[idx] = mode;
+ rs->rs_difficult = 1;
+ rs->rs_no_ack = !!no_ack;
+ }
+}
+EXPORT_SYMBOL(ptlrpc_save_lock);
+
+
+struct ptlrpc_hr_partition;
+
+struct ptlrpc_hr_thread {
+ int hrt_id; /* thread ID */
+ spinlock_t hrt_lock;
+ wait_queue_head_t hrt_waitq;
+ struct list_head hrt_queue; /* RS queue */
+ struct ptlrpc_hr_partition *hrt_partition;
+};
+
+struct ptlrpc_hr_partition {
+ /* # of started threads */
+ atomic_t hrp_nstarted;
+ /* # of stopped threads */
+ atomic_t hrp_nstopped;
+ /* cpu partition id */
+ int hrp_cpt;
+ /* round-robin rotor for choosing thread */
+ int hrp_rotor;
+ /* total number of threads on this partition */
+ int hrp_nthrs;
+ /* threads table */
+ struct ptlrpc_hr_thread *hrp_thrs;
+};
+
+#define HRT_RUNNING 0
+#define HRT_STOPPING 1
+
+struct ptlrpc_hr_service {
+ /* CPU partition table, it's just cfs_cpt_table for now */
+ struct cfs_cpt_table *hr_cpt_table;
+ /** controller sleep waitq */
+ wait_queue_head_t hr_waitq;
+ unsigned int hr_stopping;
+ /** roundrobin rotor for non-affinity service */
+ unsigned int hr_rotor;
+ /* partition data */
+ struct ptlrpc_hr_partition **hr_partitions;
+};
+
+struct rs_batch {
+ struct list_head rsb_replies;
+ unsigned int rsb_n_replies;
+ struct ptlrpc_service_part *rsb_svcpt;
+};
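+
+/*
+ * A batch accumulates up to MAX_SCHEDULED committed replies belonging to a
+ * single service partition (holding its scp_rep_lock), then hands them to
+ * one reply-handling (HR) thread with a single wakeup; see rs_batch_add()
+ * and rs_batch_dispatch() below.
+ */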
+
+/** reply handling service. */
+static struct ptlrpc_hr_service ptlrpc_hr;
+
+/**
+ * maximum number of replies scheduled in one batch
+ */
+#define MAX_SCHEDULED 256
+
+/**
+ * Initialize a reply batch.
+ *
+ * \param b batch
+ */
+static void rs_batch_init(struct rs_batch *b)
+{
+ memset(b, 0, sizeof(*b));
+ INIT_LIST_HEAD(&b->rsb_replies);
+}
+
+/**
+ * Choose an hr thread to dispatch requests to.
+ */
+static struct ptlrpc_hr_thread *
+ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_hr_partition *hrp;
+ unsigned int rotor;
+
+ if (svcpt->scp_cpt >= 0 &&
+ svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) {
+ /* directly match partition */
+ hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
+
+ } else {
+ rotor = ptlrpc_hr.hr_rotor++;
+ rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);
+
+ hrp = ptlrpc_hr.hr_partitions[rotor];
+ }
+
+ rotor = hrp->hrp_rotor++;
+ return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs];
+}
+
+/**
+ * Dispatch all replies accumulated in the batch to one of the
+ * dedicated reply handling threads.
+ *
+ * \param b batch
+ */
+static void rs_batch_dispatch(struct rs_batch *b)
+{
+ if (b->rsb_n_replies != 0) {
+ struct ptlrpc_hr_thread *hrt;
+
+ hrt = ptlrpc_hr_select(b->rsb_svcpt);
+
+ spin_lock(&hrt->hrt_lock);
+ list_splice_init(&b->rsb_replies, &hrt->hrt_queue);
+ spin_unlock(&hrt->hrt_lock);
+
+ wake_up(&hrt->hrt_waitq);
+ b->rsb_n_replies = 0;
+ }
+}
+
+/**
+ * Add a reply to a batch.
+ * Add one reply object to a batch; schedule the batched replies if the batch is full.
+ *
+ * \param b batch
+ * \param rs reply
+ */
+static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
+{
+ struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+ if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) {
+ if (b->rsb_svcpt != NULL) {
+ rs_batch_dispatch(b);
+ spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+ }
+ spin_lock(&svcpt->scp_rep_lock);
+ b->rsb_svcpt = svcpt;
+ }
+ spin_lock(&rs->rs_lock);
+ rs->rs_scheduled_ever = 1;
+ if (rs->rs_scheduled == 0) {
+ list_move(&rs->rs_list, &b->rsb_replies);
+ rs->rs_scheduled = 1;
+ b->rsb_n_replies++;
+ }
+ rs->rs_committed = 1;
+ spin_unlock(&rs->rs_lock);
+}
+
+/**
+ * Reply batch finalization.
+ * Dispatch remaining replies from the batch
+ * and release the remaining spinlock.
+ *
+ * \param b batch
+ */
+static void rs_batch_fini(struct rs_batch *b)
+{
+ if (b->rsb_svcpt != NULL) {
+ rs_batch_dispatch(b);
+ spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+ }
+}
+
+#define DECLARE_RS_BATCH(b) struct rs_batch b
+
+
+/**
+ * Put the reply state into a queue for processing because we received
+ * an ACK from the client
+ */
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+ struct ptlrpc_hr_thread *hrt;
+
+ LASSERT(list_empty(&rs->rs_list));
+
+ hrt = ptlrpc_hr_select(rs->rs_svcpt);
+
+ spin_lock(&hrt->hrt_lock);
+ list_add_tail(&rs->rs_list, &hrt->hrt_queue);
+ spin_unlock(&hrt->hrt_lock);
+
+ wake_up(&hrt->hrt_waitq);
+}
+
+void
+ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+ assert_spin_locked(&rs->rs_svcpt->scp_rep_lock);
+ assert_spin_locked(&rs->rs_lock);
+ LASSERT(rs->rs_difficult);
+ rs->rs_scheduled_ever = 1; /* flag any notification attempt */
+
+ if (rs->rs_scheduled) { /* being set up or already notified */
+ return;
+ }
+
+ rs->rs_scheduled = 1;
+ list_del_init(&rs->rs_list);
+ ptlrpc_dispatch_difficult_reply(rs);
+}
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+
+void ptlrpc_commit_replies(struct obd_export *exp)
+{
+ struct ptlrpc_reply_state *rs, *nxt;
+ DECLARE_RS_BATCH(batch);
+
+ rs_batch_init(&batch);
+ /* Find any replies that have been committed and get their service
+  * to complete them. */
+
+ /* CAVEAT EMPTOR: spinlock ordering!!! */
+ spin_lock(&exp->exp_uncommitted_replies_lock);
+ list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
+ rs_obd_list) {
+ LASSERT(rs->rs_difficult);
+ /* VBR: per-export last_committed */
+ LASSERT(rs->rs_export);
+ if (rs->rs_transno <= exp->exp_last_committed) {
+ list_del_init(&rs->rs_obd_list);
+ rs_batch_add(&batch, rs);
+ }
+ }
+ spin_unlock(&exp->exp_uncommitted_replies_lock);
+ rs_batch_fini(&batch);
+}
+EXPORT_SYMBOL(ptlrpc_commit_replies);
+
+static int
+ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_request_buffer_desc *rqbd;
+ int rc;
+ int posted = 0;
+
+ for (;;) {
+ spin_lock(&svcpt->scp_lock);
+
+ if (list_empty(&svcpt->scp_rqbd_idle)) {
+ spin_unlock(&svcpt->scp_lock);
+ return posted;
+ }
+
+ rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+ struct ptlrpc_request_buffer_desc,
+ rqbd_list);
+ list_del(&rqbd->rqbd_list);
+
+ /* assume we will post successfully */
+ svcpt->scp_nrqbds_posted++;
+ list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+
+ spin_unlock(&svcpt->scp_lock);
+
+ rc = ptlrpc_register_rqbd(rqbd);
+ if (rc != 0)
+ break;
+
+ posted = 1;
+ }
+
+ spin_lock(&svcpt->scp_lock);
+
+ svcpt->scp_nrqbds_posted--;
+ list_del(&rqbd->rqbd_list);
+ list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+
+ /* Don't complain if no request buffers are posted right now; LNET
+ * won't drop requests because we set the portal lazy! */
+
+ spin_unlock(&svcpt->scp_lock);
+
+ return -1;
+}
+
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+ struct ptlrpc_service_part *svcpt;
+
+ svcpt = (struct ptlrpc_service_part *)castmeharder;
+
+ svcpt->scp_at_check = 1;
+ svcpt->scp_at_checktime = cfs_time_current();
+ wake_up(&svcpt->scp_waitq);
+}
+
+static void
+ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
+ struct ptlrpc_service_conf *conf)
+{
+ struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
+ unsigned init;
+ unsigned total;
+ unsigned nthrs;
+ int weight;
+
+ /*
+  * Common code for estimating & validating the number of threads.
+  * A CPT-affinity service may have a per-CPT thread pool instead
+  * of a global thread pool, which means users might not always
+  * get the thread count they requested in conf::tc_nthrs_user
+  * even if they did set it. This is because we must validate the
+  * thread count for each CPT to guarantee that each pool has
+  * enough threads to keep the service healthy.
+  */
+ init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
+ init = max_t(int, init, tc->tc_nthrs_init);
+
+ /* NB: please see comments in lustre_lnet.h for definition
+ * details of these members */
+ LASSERT(tc->tc_nthrs_max != 0);
+
+ if (tc->tc_nthrs_user != 0) {
+ /* In case there is a reason to test a service with many
+ * threads, we apply a less strict check here; the total can
+ * be up to 8 * nthrs_max */
+ total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
+ nthrs = total / svc->srv_ncpts;
+ init = max(init, nthrs);
+ goto out;
+ }
+
+ total = tc->tc_nthrs_max;
+ if (tc->tc_nthrs_base == 0) {
+ /* we don't care about the base number of threads per partition;
+ * this is mostly for non-affinity services */
+ nthrs = total / svc->srv_ncpts;
+ goto out;
+ }
+
+ nthrs = tc->tc_nthrs_base;
+ if (svc->srv_ncpts == 1) {
+ int i;
+
+ /* NB: Increase the base number if there is a single partition
+ * and the total number of cores/HTs is at least 4.
+ * The result will always be < 2 * nthrs_base */
+ weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
+ for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
+ (tc->tc_nthrs_base >> i) != 0; i++)
+ nthrs += tc->tc_nthrs_base >> i;
+ }
+
+ if (tc->tc_thr_factor != 0) {
+ int factor = tc->tc_thr_factor;
+ const int fade = 4;
+
+ /*
+ * The user wants to increase the number of threads for each
+ * CPU core/HT; most likely the factor is larger than one
+ * thread per core because service threads are expected to be
+ * blocked on locks or waiting for I/O.
+ */
+ /*
+ * Amdahl's law says that adding processors does not give a
+ * linear increase in parallelism, so it makes no sense to
+ * have too many threads no matter how many cores/HTs
+ * there are.
+ */
+ /* weight is # of HTs */
+ if (cpumask_weight(topology_thread_cpumask(0)) > 1) {
+ /* depress thread factor for hyper-thread */
+ factor = factor - (factor >> 1) + (factor >> 3);
+ }
+
+ weight = cfs_cpt_weight(svc->srv_cptable, 0);
+ LASSERT(weight > 0);
+
+ for (; factor > 0 && weight > 0; factor--, weight -= fade)
+ nthrs += min(weight, fade) * factor;
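+
+ /* Hypothetical example (not from the original code): with
+ * tc_thr_factor = 4, fade = 4, an 8-core CPT (weight = 8) and no
+ * hyper-thread depression, this loop adds
+ * min(8, 4) * 4 + min(4, 4) * 3 = 28 threads to nthrs. */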
+ }
+
+ if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+ nthrs = max(tc->tc_nthrs_base,
+ tc->tc_nthrs_max / svc->srv_ncpts);
+ }
+ out:
+ nthrs = max(nthrs, tc->tc_nthrs_init);
+ svc->srv_nthrs_cpt_limit = nthrs;
+ svc->srv_nthrs_cpt_init = init;
+
+ if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+ CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n",
+ svc->srv_name, nthrs * svc->srv_ncpts,
+ tc->tc_nthrs_max);
+ }
+}
+
+/**
+ * Initialize percpt data for a service
+ */
+static int
+ptlrpc_service_part_init(struct ptlrpc_service *svc,
+ struct ptlrpc_service_part *svcpt, int cpt)
+{
+ struct ptlrpc_at_array *array;
+ int size;
+ int index;
+ int rc;
+
+ svcpt->scp_cpt = cpt;
+ INIT_LIST_HEAD(&svcpt->scp_threads);
+
+ /* rqbd and incoming request queue */
+ spin_lock_init(&svcpt->scp_lock);
+ INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
+ INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
+ INIT_LIST_HEAD(&svcpt->scp_req_incoming);
+ init_waitqueue_head(&svcpt->scp_waitq);
+ /* history request & rqbd list */
+ INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
+ INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
+
+ /* active requests and hp requests */
+ spin_lock_init(&svcpt->scp_req_lock);
+
+ /* reply states */
+ spin_lock_init(&svcpt->scp_rep_lock);
+ INIT_LIST_HEAD(&svcpt->scp_rep_active);
+ INIT_LIST_HEAD(&svcpt->scp_rep_idle);
+ init_waitqueue_head(&svcpt->scp_rep_waitq);
+ atomic_set(&svcpt->scp_nreps_difficult, 0);
+
+ /* adaptive timeout */
+ spin_lock_init(&svcpt->scp_at_lock);
+ array = &svcpt->scp_at_array;
+
+ size = at_est2timeout(at_max);
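+ /* the AT array keeps one bucket of requests per second of deadline,
+ * sized from the maximum adaptive timeout (at_max) */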
+ array->paa_size = size;
+ array->paa_count = 0;
+ array->paa_deadline = -1;
+
+ /* allocate memory for scp_at_array (ptlrpc_at_array) */
+ OBD_CPT_ALLOC(array->paa_reqs_array,
+ svc->srv_cptable, cpt, sizeof(struct list_head) * size);
+ if (array->paa_reqs_array == NULL)
+ return -ENOMEM;
+
+ for (index = 0; index < size; index++)
+ INIT_LIST_HEAD(&array->paa_reqs_array[index]);
+
+ OBD_CPT_ALLOC(array->paa_reqs_count,
+ svc->srv_cptable, cpt, sizeof(__u32) * size);
+ if (array->paa_reqs_count == NULL)
+ goto failed;
+
+ cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt);
+ /* At SOW, service time should be quick; 10s seems generous. If client
+ * timeout is less than this, we'll be sending an early reply. */
+ at_init(&svcpt->scp_at_estimate, 10, 0);
+
+ /* assign this before calling ptlrpc_grow_req_bufs */
+ svcpt->scp_service = svc;
+ /* Now allocate the request buffers, but don't post them yet */
+ rc = ptlrpc_grow_req_bufs(svcpt, 0);
+ /* We shouldn't be under memory pressure at startup, so
+ * fail if we can't allocate all our buffers at this time. */
+ if (rc != 0)
+ goto failed;
+
+ return 0;
+
+ failed:
+ if (array->paa_reqs_count != NULL) {
+ OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size);
+ array->paa_reqs_count = NULL;
+ }
+
+ if (array->paa_reqs_array != NULL) {
+ OBD_FREE(array->paa_reqs_array,
+ sizeof(struct list_head) * array->paa_size);
+ array->paa_reqs_array = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+/**
+ * Initialize service on a given portal.
+ * This includes starting service threads, allocating and posting rqbds, and
+ * so on.
+ */
+struct ptlrpc_service *
+ptlrpc_register_service(struct ptlrpc_service_conf *conf,
+ struct proc_dir_entry *proc_entry)
+{
+ struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt;
+ struct ptlrpc_service *service;
+ struct ptlrpc_service_part *svcpt;
+ struct cfs_cpt_table *cptable;
+ __u32 *cpts = NULL;
+ int ncpts;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(conf->psc_buf.bc_nbufs > 0);
+ LASSERT(conf->psc_buf.bc_buf_size >=
+ conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
+ LASSERT(conf->psc_thr.tc_ctx_tags != 0);
+
+ cptable = cconf->cc_cptable;
+ if (cptable == NULL)
+ cptable = cfs_cpt_table;
+
+ if (!conf->psc_thr.tc_cpu_affinity) {
+ ncpts = 1;
+ } else {
+ ncpts = cfs_cpt_number(cptable);
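+ /* an optional CPT pattern string restricts the service to an
+ * explicit subset of CPU partitions */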
+ if (cconf->cc_pattern != NULL) {
+ struct cfs_expr_list *el;
+
+ rc = cfs_expr_list_parse(cconf->cc_pattern,
+ strlen(cconf->cc_pattern),
+ 0, ncpts - 1, &el);
+ if (rc != 0) {
+ CERROR("%s: invalid CPT pattern string: %s",
+ conf->psc_name, cconf->cc_pattern);
+ return ERR_PTR(-EINVAL);
+ }
+
+ rc = cfs_expr_list_values(el, ncpts, &cpts);
+ cfs_expr_list_free(el);
+ if (rc <= 0) {
+ CERROR("%s: failed to parse CPT array %s: %d\n",
+ conf->psc_name, cconf->cc_pattern, rc);
+ if (cpts != NULL)
+ OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+ return ERR_PTR(rc < 0 ? rc : -EINVAL);
+ }
+ ncpts = rc;
+ }
+ }
+
+ OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
+ if (service == NULL) {
+ if (cpts != NULL)
+ OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ service->srv_cptable = cptable;
+ service->srv_cpts = cpts;
+ service->srv_ncpts = ncpts;
+
+ service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
+ while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
+ service->srv_cpt_bits++;
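+ /* srv_cpt_bits is now the number of bits needed to address any
+ * CPT, i.e. ceil(log2(number of CPTs)) */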
+
+ /* public members */
+ spin_lock_init(&service->srv_lock);
+ service->srv_name = conf->psc_name;
+ service->srv_watchdog_factor = conf->psc_watchdog_factor;
+ INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */
+
+ /* buffer configuration */
+ service->srv_nbuf_per_group = test_req_buffer_pressure ?
+ 1 : conf->psc_buf.bc_nbufs;
+ service->srv_max_req_size = conf->psc_buf.bc_req_max_size +
+ SPTLRPC_MAX_PAYLOAD;
+ service->srv_buf_size = conf->psc_buf.bc_buf_size;
+ service->srv_rep_portal = conf->psc_buf.bc_rep_portal;
+ service->srv_req_portal = conf->psc_buf.bc_req_portal;
+
+ /* Increase max reply size to next power of two */
+ service->srv_max_reply_size = 1;
+ while (service->srv_max_reply_size <
+ conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
+ service->srv_max_reply_size <<= 1;
+
+ service->srv_thread_name = conf->psc_thr.tc_thr_name;
+ service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags;
+ service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO;
+ service->srv_ops = conf->psc_ops;
+
+ for (i = 0; i < ncpts; i++) {
+ if (!conf->psc_thr.tc_cpu_affinity)
+ cpt = CFS_CPT_ANY;
+ else
+ cpt = cpts != NULL ? cpts[i] : i;
+
+ OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
+ if (svcpt == NULL) {
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ service->srv_parts[i] = svcpt;
+ rc = ptlrpc_service_part_init(service, svcpt, cpt);
+ if (rc != 0)
+ goto failed;
+ }
+
+ ptlrpc_server_nthreads_check(service, conf);
+
+ rc = LNetSetLazyPortal(service->srv_req_portal);
+ LASSERT(rc == 0);
+
+ mutex_lock(&ptlrpc_all_services_mutex);
+ list_add(&service->srv_list, &ptlrpc_all_services);
+ mutex_unlock(&ptlrpc_all_services_mutex);
+
+ if (proc_entry != NULL)
+ ptlrpc_lprocfs_register_service(proc_entry, service);
+
+ rc = ptlrpc_service_nrs_setup(service);
+ if (rc != 0)
+ goto failed;
+
+ CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
+ service->srv_name, service->srv_req_portal);
+
+ rc = ptlrpc_start_threads(service);
+ if (rc != 0) {
+ CERROR("Failed to start threads for service %s: %d\n",
+ service->srv_name, rc);
+ goto failed;
+ }
+
+ return service;
+failed:
+ ptlrpc_unregister_service(service);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ptlrpc_register_service);
+
+/**
+ * Actually free the request; must be called without holding svc_lock.
+ * Note it is the caller's responsibility to unlink req->rq_list.
+ */
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+ LASSERT(atomic_read(&req->rq_refcount) == 0);
+ LASSERT(list_empty(&req->rq_timed_list));
+
+ /* DEBUG_REQ() assumes the reply state of a request with a valid
+ * ref will not be destroyed until that reference is dropped. */
+ ptlrpc_req_drop_rs(req);
+
+ sptlrpc_svc_ctx_decref(req);
+
+ if (req != &req->rq_rqbd->rqbd_req) {
+ /* NB request buffers use an embedded
+ * req if the incoming req unlinked the
+ * MD; this isn't one of them! */
+ ptlrpc_request_cache_free(req);
+ }
+}
+
+/**
+ * Drop a reference count of the request. If it reaches 0, we either
+ * put it into the history list or free it immediately.
+ */
+void ptlrpc_server_drop_request(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
+ struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ int refcount;
+ struct list_head *tmp;
+ struct list_head *nxt;
+
+ if (!atomic_dec_and_test(&req->rq_refcount))
+ return;
+
+ if (req->rq_at_linked) {
+ spin_lock(&svcpt->scp_at_lock);
+ /* recheck with lock, in case it's unlinked by
+ * ptlrpc_at_check_timed() */
+ if (likely(req->rq_at_linked))
+ ptlrpc_at_remove_timed(req);
+ spin_unlock(&svcpt->scp_at_lock);
+ }
+
+ LASSERT(list_empty(&req->rq_timed_list));
+
+ /* finalize request */
+ if (req->rq_export) {
+ class_export_put(req->rq_export);
+ req->rq_export = NULL;
+ }
+
+ spin_lock(&svcpt->scp_lock);
+
+ list_add(&req->rq_list, &rqbd->rqbd_reqs);
+
+ refcount = --(rqbd->rqbd_refcount);
+ if (refcount == 0) {
+ /* request buffer is now idle: add to history */
+ list_del(&rqbd->rqbd_list);
+
+ list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+ svcpt->scp_hist_nrqbds++;
+
+ /* cull some history?
+ * I expect only about 1 or 2 rqbds need to be recycled here */
+ while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
+ rqbd = list_entry(svcpt->scp_hist_rqbds.next,
+ struct ptlrpc_request_buffer_desc,
+ rqbd_list);
+
+ list_del(&rqbd->rqbd_list);
+ svcpt->scp_hist_nrqbds--;
+
+ /* remove rqbd's reqs from svc's req history while
+ * I've got the service lock */
+ list_for_each(tmp, &rqbd->rqbd_reqs) {
+ req = list_entry(tmp, struct ptlrpc_request,
+ rq_list);
+ /* Track the highest culled req seq */
+ if (req->rq_history_seq >
+ svcpt->scp_hist_seq_culled) {
+ svcpt->scp_hist_seq_culled =
+ req->rq_history_seq;
+ }
+ list_del(&req->rq_history_list);
+ }
+
+ spin_unlock(&svcpt->scp_lock);
+
+ list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
+ req = list_entry(rqbd->rqbd_reqs.next,
+ struct ptlrpc_request,
+ rq_list);
+ list_del(&req->rq_list);
+ ptlrpc_server_free_request(req);
+ }
+
+ spin_lock(&svcpt->scp_lock);
+ /*
+ * now all reqs including the embedded req have been
+ * disposed of, schedule the request buffer for re-use.
+ */
+ LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
+ 0);
+ list_add_tail(&rqbd->rqbd_list,
+ &svcpt->scp_rqbd_idle);
+ }
+
+ spin_unlock(&svcpt->scp_lock);
+ } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
+ /* If we are low on memory, we are not interested in history */
+ list_del(&req->rq_list);
+ list_del_init(&req->rq_history_list);
+
+ /* Track the highest culled req seq */
+ if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
+ svcpt->scp_hist_seq_culled = req->rq_history_seq;
+
+ spin_unlock(&svcpt->scp_lock);
+
+ ptlrpc_server_free_request(req);
+ } else {
+ spin_unlock(&svcpt->scp_lock);
+ }
+}
+
+/** Change request export and move hp request from old export to new */
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+ struct obd_export *export)
+{
+ if (req->rq_export != NULL) {
+ if (!list_empty(&req->rq_exp_list)) {
+ /* remove rq_exp_list from last export */
+ spin_lock_bh(&req->rq_export->exp_rpc_lock);
+ list_del_init(&req->rq_exp_list);
+ spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+
+ /* export has one reference already, so it's safe to
+ * add req to export queue here and get another
+ * reference for request later */
+ spin_lock_bh(&export->exp_rpc_lock);
+ list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+ spin_unlock_bh(&export->exp_rpc_lock);
+ }
+ class_export_rpc_dec(req->rq_export);
+ class_export_put(req->rq_export);
+ }
+
+ /* request takes one export refcount */
+ req->rq_export = class_export_get(export);
+ class_export_rpc_inc(export);
+}
+
+/**
+ * To finish a request: stop sending more early replies, and release
+ * the request.
+ */
+static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req)
+{
+ ptlrpc_server_hpreq_fini(req);
+
+ ptlrpc_server_drop_request(req);
+}
+
+/**
+ * To finish an active request: stop sending more early replies, and release
+ * the request. Should be called after we have finished handling the request.
+ */
+static void ptlrpc_server_finish_active_request(
+ struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req)
+{
+ spin_lock(&svcpt->scp_req_lock);
+ ptlrpc_nrs_req_stop_nolock(req);
+ svcpt->scp_nreqs_active--;
+ if (req->rq_hp)
+ svcpt->scp_nhreqs_active--;
+ spin_unlock(&svcpt->scp_req_lock);
+
+ ptlrpc_nrs_req_finalize(req);
+
+ if (req->rq_export != NULL)
+ class_export_rpc_dec(req->rq_export);
+
+ ptlrpc_server_finish_request(svcpt, req);
+}
+
+/**
+ * This function makes sure dead exports are evicted in a timely manner.
+ * It is only called when some export receives a message (i.e.,
+ * the network is up).
+ */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+ struct obd_export *oldest_exp;
+ time_t oldest_time, new_time;
+
+ LASSERT(exp);
+
+ /* Compensate for slow machines, etc, by faking our request time
+ into the future. Although this can break the strict time-ordering
+ of the list, we can be really lazy here - we don't have to evict
+ at the exact right moment. Eventually, all silent exports
+ will make it to the top of the list. */
+
+ /* Do not pay attention to renewals of 1 second or less. */
+ new_time = get_seconds() + extra_delay;
+ if (exp->exp_last_request_time + 1 /*second */ >= new_time)
+ return;
+
+ exp->exp_last_request_time = new_time;
+
+ /* exports may get disconnected from the chain even though the
+ export has references, so we must keep the spin lock while
+ manipulating the lists */
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+
+ if (list_empty(&exp->exp_obd_chain_timed)) {
+ /* this one is not timed */
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ return;
+ }
+
+ list_move_tail(&exp->exp_obd_chain_timed,
+ &exp->exp_obd->obd_exports_timed);
+
+ oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+ struct obd_export, exp_obd_chain_timed);
+ oldest_time = oldest_exp->exp_last_request_time;
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+ if (exp->exp_obd->obd_recovering) {
+ /* be nice to everyone during recovery */
+ return;
+ }
+
+ /* Note - racing to start/reset the obd_eviction timer is safe */
+ if (exp->exp_obd->obd_eviction_timer == 0) {
+ /* Check if the oldest entry is expired. */
+ if (get_seconds() > (oldest_time + PING_EVICT_TIMEOUT +
+ extra_delay)) {
+ /* We need a second timer, in case the net was down and
+ * it just came back. Since the pinger may skip every
+ * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+ * we better wait for 3. */
+ exp->exp_obd->obd_eviction_timer =
+ get_seconds() + 3 * PING_INTERVAL;
+ CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
+ exp->exp_obd->obd_name,
+ obd_export_nid2str(oldest_exp), oldest_time);
+ }
+ } else {
+ if (get_seconds() >
+ (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+ /* The evictor won't evict anyone who we've heard from
+ * recently, so we don't have to check before we start
+ * it. */
+ if (!ping_evictor_wake(exp))
+ exp->exp_obd->obd_eviction_timer = 0;
+ }
+ }
+}
+
+/**
+ * Sanity check request \a req.
+ * Return 0 if all is ok, error code otherwise.
+ */
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+ struct obd_device *obd = req->rq_export->exp_obd;
+ int rc = 0;
+
+ if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
+ req->rq_export->exp_conn_cnt)) {
+ DEBUG_REQ(D_RPCTRACE, req,
+ "DROPPING req from old connection %d < %d",
+ lustre_msg_get_conn_cnt(req->rq_reqmsg),
+ req->rq_export->exp_conn_cnt);
+ return -EEXIST;
+ }
+ if (unlikely(obd == NULL || obd->obd_fail)) {
+ /*
+ * Failing over, don't handle any more reqs, send
+ * error response instead.
+ */
+ CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+ req, (obd != NULL) ? obd->obd_name : "unknown");
+ rc = -ENODEV;
+ } else if (lustre_msg_get_flags(req->rq_reqmsg) &
+ (MSG_REPLAY | MSG_REQ_REPLAY_DONE) &&
+ !obd->obd_recovering) {
+ DEBUG_REQ(D_ERROR, req,
+ "Invalid replay without recovery");
+ class_fail_export(req->rq_export);
+ rc = -ENODEV;
+ } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 &&
+ !obd->obd_recovering) {
+ DEBUG_REQ(D_ERROR, req, "Invalid req with transno %llu without recovery",
+ lustre_msg_get_transno(req->rq_reqmsg));
+ class_fail_export(req->rq_export);
+ rc = -ENODEV;
+ }
+
+ if (unlikely(rc < 0)) {
+ req->rq_status = rc;
+ ptlrpc_error(req);
+ }
+ return rc;
+}
+
+static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+ __s32 next;
+
+ if (array->paa_count == 0) {
+ cfs_timer_disarm(&svcpt->scp_at_timer);
+ return;
+ }
+
+ /* Set timer for closest deadline */
+ next = (__s32)(array->paa_deadline - get_seconds() -
+ at_early_margin);
+ if (next <= 0) {
+ ptlrpc_at_timer((unsigned long)svcpt);
+ } else {
+ cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next));
+ CDEBUG(D_INFO, "armed %s at %+ds\n",
+ svcpt->scp_service->srv_name, next);
+ }
+}
+
+/* Add rpc to early reply check list */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+ struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+ struct ptlrpc_request *rq = NULL;
+ __u32 index;
+
+ if (AT_OFF)
+ return 0;
+
+ if (req->rq_no_reply)
+ return 0;
+
+ if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
+ return -ENOSYS;
+
+ spin_lock(&svcpt->scp_at_lock);
+ LASSERT(list_empty(&req->rq_timed_list));
+
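+ /* paa_reqs_array is a ring of per-second buckets: requests whose
+ * deadlines are congruent modulo paa_size share a bucket, and each
+ * bucket is kept sorted by deadline */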
+ index = (unsigned long)req->rq_deadline % array->paa_size;
+ if (array->paa_reqs_count[index] > 0) {
+ /* latest rpcs will have the latest deadlines in the list,
+ * so search backward. */
+ list_for_each_entry_reverse(rq,
+ &array->paa_reqs_array[index],
+ rq_timed_list) {
+ if (req->rq_deadline >= rq->rq_deadline) {
+ list_add(&req->rq_timed_list,
+ &rq->rq_timed_list);
+ break;
+ }
+ }
+ }
+
+ /* Add the request at the head of the list */
+ if (list_empty(&req->rq_timed_list))
+ list_add(&req->rq_timed_list,
+ &array->paa_reqs_array[index]);
+
+ spin_lock(&req->rq_lock);
+ req->rq_at_linked = 1;
+ spin_unlock(&req->rq_lock);
+ req->rq_at_index = index;
+ array->paa_reqs_count[index]++;
+ array->paa_count++;
+ if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) {
+ array->paa_deadline = req->rq_deadline;
+ ptlrpc_at_set_timer(svcpt);
+ }
+ spin_unlock(&svcpt->scp_at_lock);
+
+ return 0;
+}
+
+static void
+ptlrpc_at_remove_timed(struct ptlrpc_request *req)
+{
+ struct ptlrpc_at_array *array;
+
+ array = &req->rq_rqbd->rqbd_svcpt->scp_at_array;
+
+ /* NB: must be called while holding svcpt::scp_at_lock */
+ LASSERT(!list_empty(&req->rq_timed_list));
+ list_del_init(&req->rq_timed_list);
+
+ spin_lock(&req->rq_lock);
+ req->rq_at_linked = 0;
+ spin_unlock(&req->rq_lock);
+
+ array->paa_reqs_count[req->rq_at_index]--;
+ array->paa_count--;
+}
+
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+ struct ptlrpc_request *reqcopy;
+ struct lustre_msg *reqmsg;
+ long olddl = req->rq_deadline - get_seconds();
+ time_t newdl;
+ int rc;
+
+ /* the deadline is when the client expects us to reply; the margin is
+ the difference between the client's and the server's expectations */
+ DEBUG_REQ(D_ADAPTTO, req,
+ "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d",
+ AT_OFF ? "AT off - not " : "",
+ olddl, olddl - at_get(&svcpt->scp_at_estimate),
+ at_get(&svcpt->scp_at_estimate), at_extra);
+
+ if (AT_OFF)
+ return 0;
+
+ if (olddl < 0) {
+ DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?",
+ olddl, at_early_margin);
+
+ /* Return an error so we're not re-added to the timed list. */
+ return -ETIMEDOUT;
+ }
+
+ if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+ DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support");
+ return -ENOSYS;
+ }
+
+ if (req->rq_export &&
+ lustre_msg_get_flags(req->rq_reqmsg) &
+ (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+ /* During recovery, we don't want to send too many early
+ * replies, but on the other hand we want to make sure the
+ * client has enough time to resend if the rpc is lost. So
+ * during the recovery period send at least 4 early replies,
+ * spacing them every at_extra if we can. at_estimate should
+ * always equal this fixed value during recovery. */
+ at_measured(&svcpt->scp_at_estimate, min(at_extra,
+ req->rq_export->exp_obd->obd_recovery_timeout / 4));
+ } else {
+ /* Fake our processing time into the future to ask the clients
+ * for some extra amount of time */
+ at_measured(&svcpt->scp_at_estimate, at_extra +
+ get_seconds() -
+ req->rq_arrival_time.tv_sec);
+
+ /* Check to see if we've actually increased the deadline -
+ * we may be past adaptive_max */
+ if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+ at_get(&svcpt->scp_at_estimate)) {
+ DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%ld), not sending early reply\n",
+ olddl, req->rq_arrival_time.tv_sec +
+ at_get(&svcpt->scp_at_estimate) -
+ get_seconds());
+ return -ETIMEDOUT;
+ }
+ }
+ newdl = get_seconds() + at_get(&svcpt->scp_at_estimate);
+
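+ /* work on a shallow copy of the request so the early reply can be
+ * packed and sent without disturbing the original request's reply
+ * state */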
+ reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS);
+ if (reqcopy == NULL)
+ return -ENOMEM;
+ OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
+ if (!reqmsg) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ *reqcopy = *req;
+ reqcopy->rq_reply_state = NULL;
+ reqcopy->rq_rep_swab_mask = 0;
+ reqcopy->rq_pack_bulk = 0;
+ reqcopy->rq_pack_udesc = 0;
+ reqcopy->rq_packed_final = 0;
+ sptlrpc_svc_ctx_addref(reqcopy);
+ /* We only need the reqmsg for the magic */
+ reqcopy->rq_reqmsg = reqmsg;
+ memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+ LASSERT(atomic_read(&req->rq_refcount));
+ /** if this is the last refcount then an early reply isn't needed */
+ if (atomic_read(&req->rq_refcount) == 1) {
+ DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Connection ref */
+ reqcopy->rq_export = class_conn2export(
+ lustre_msg_get_handle(reqcopy->rq_reqmsg));
+ if (reqcopy->rq_export == NULL) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ /* RPC ref */
+ class_export_rpc_inc(reqcopy->rq_export);
+ if (reqcopy->rq_export->exp_obd &&
+ reqcopy->rq_export->exp_obd->obd_fail) {
+ rc = -ENODEV;
+ goto out_put;
+ }
+
+ rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+ if (rc)
+ goto out_put;
+
+ rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+ if (!rc) {
+ /* Adjust our own deadline to what we told the client */
+ req->rq_deadline = newdl;
+ req->rq_early_count++; /* number sent, server side */
+ } else {
+ DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+ }
+
+ /* Free the (early) reply state from lustre_pack_reply.
+ (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
+ ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+ class_export_rpc_dec(reqcopy->rq_export);
+ class_export_put(reqcopy->rq_export);
+out:
+ sptlrpc_svc_ctx_decref(reqcopy);
+ OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
+out_free:
+ ptlrpc_request_cache_free(reqcopy);
+ return rc;
+}
+
+/* Send early replies to everybody expiring within at_early_margin
+ asking for at_extra time */
+static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+ struct ptlrpc_request *rq, *n;
+ struct list_head work_list;
+ __u32 index, count;
+ time_t deadline;
+ time_t now = get_seconds();
+ long delay;
+ int first, counter = 0;
+
+ spin_lock(&svcpt->scp_at_lock);
+ if (svcpt->scp_at_check == 0) {
+ spin_unlock(&svcpt->scp_at_lock);
+ return 0;
+ }
+ delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
+ svcpt->scp_at_check = 0;
+
+ if (array->paa_count == 0) {
+ spin_unlock(&svcpt->scp_at_lock);
+ return 0;
+ }
+
+ /* The timer went off, but maybe the nearest rpc already completed. */
+ first = array->paa_deadline - now;
+ if (first > at_early_margin) {
+ /* We've still got plenty of time. Reset the timer. */
+ ptlrpc_at_set_timer(svcpt);
+ spin_unlock(&svcpt->scp_at_lock);
+ return 0;
+ }
+
+ /* We're close to a timeout, and we don't know how much longer the
+ server will take. Send early replies to everyone expiring soon. */
+ INIT_LIST_HEAD(&work_list);
+ deadline = -1;
+ index = (unsigned long)array->paa_deadline % array->paa_size;
+ count = array->paa_count;
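+ /* walk the buckets starting at the one holding the earliest
+ * deadline, collecting requests that expire within at_early_margin
+ * until every queued request has been examined */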
+ while (count > 0) {
+ count -= array->paa_reqs_count[index];
+ list_for_each_entry_safe(rq, n,
+ &array->paa_reqs_array[index],
+ rq_timed_list) {
+ if (rq->rq_deadline > now + at_early_margin) {
+ /* update the earliest deadline */
+ if (deadline == -1 ||
+ rq->rq_deadline < deadline)
+ deadline = rq->rq_deadline;
+ break;
+ }
+
+ ptlrpc_at_remove_timed(rq);
+ /**
+ * ptlrpc_server_drop_request() may have dropped
+ * the refcount to 0 already. Check for this and
+ * don't add the entry to work_list
+ */
+ if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
+ list_add(&rq->rq_timed_list, &work_list);
+ counter++;
+ }
+
+ if (++index >= array->paa_size)
+ index = 0;
+ }
+ array->paa_deadline = deadline;
+ /* we have a new earliest deadline, restart the timer */
+ ptlrpc_at_set_timer(svcpt);
+
+ spin_unlock(&svcpt->scp_at_lock);
+
+ CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n",
+ first, at_extra, counter);
+ if (first < 0) {
+ /* We're already past request deadlines before we even get a
+ chance to send early replies */
+ LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n",
+ svcpt->scp_service->srv_name);
+ CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=" CFS_DURATION_T "(jiff)\n",
+ counter, svcpt->scp_nreqs_incoming,
+ svcpt->scp_nreqs_active,
+ at_get(&svcpt->scp_at_estimate), delay);
+ }
+
+ /* we took an additional refcount so entries can't be deleted from the
+ * list; no locking is needed */
+ while (!list_empty(&work_list)) {
+ rq = list_entry(work_list.next, struct ptlrpc_request,
+ rq_timed_list);
+ list_del_init(&rq->rq_timed_list);
+
+ if (ptlrpc_at_send_early_reply(rq) == 0)
+ ptlrpc_at_add_timed(rq);
+
+ ptlrpc_server_drop_request(rq);
+ }
+
+ return 1; /* return "did_something" for liblustre */
+}
+
+/**
+ * Put the request on the export list if the request may become
+ * a high priority one.
+ */
+static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req)
+{
+ int rc = 0;
+
+ if (svcpt->scp_service->srv_ops.so_hpreq_handler) {
+ rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req);
+ if (rc < 0)
+ return rc;
+ LASSERT(rc == 0);
+ }
+ if (req->rq_export && req->rq_ops) {
+ /* Perform a request-specific check. We should do this check
+ * before the request is added to the exp_hp_rpcs list, otherwise
+ * it may hit the swab race described in LU-1044. */
+ if (req->rq_ops->hpreq_check) {
+ rc = req->rq_ops->hpreq_check(req);
+ /**
+ * XXX: Out of all current
+ * ptlrpc_hpreq_ops::hpreq_check(), only
+ * ldlm_cancel_hpreq_check() can return an error code;
+ * other functions assert in similar places, which seems
+ * odd. What also does not seem right is that handlers
+ * for those RPCs do not assert on the same checks, but
+ * rather handle the error cases. e.g. see
+ * ost_rw_hpreq_check(), and ost_brw_read(),
+ * ost_brw_write().
+ */
+ if (rc < 0)
+ return rc;
+ LASSERT(rc == 0 || rc == 1);
+ }
+
+ spin_lock_bh(&req->rq_export->exp_rpc_lock);
+ list_add(&req->rq_exp_list,
+ &req->rq_export->exp_hp_rpcs);
+ spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+ }
+
+ ptlrpc_nrs_req_initialize(svcpt, req, rc);
+
+ return rc;
+}
+
+/** Remove the request from the export list. */
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
+{
+ if (req->rq_export && req->rq_ops) {
+ /* refresh the lock timeout again so that the client has more
+ * room to send a lock cancel RPC. */
+ if (req->rq_ops->hpreq_fini)
+ req->rq_ops->hpreq_fini(req);
+
+ spin_lock_bh(&req->rq_export->exp_rpc_lock);
+ list_del_init(&req->rq_exp_list);
+ spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+ }
+}
+
+static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
+{
+ return 1;
+}
+
+static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
+ .hpreq_check = ptlrpc_hpreq_check,
+};
+
+/* Hi-Priority RPC check by RPC operation code. */
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
+{
+ int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+ /* Check the export so that only reconnects for a not-yet-evicted
+ * export can become a HP rpc. */
+ if ((req->rq_export != NULL) &&
+ (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT))
+ req->rq_ops = &ptlrpc_hpreq_common;
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_hpreq_handler);
+
+static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_request *req)
+{
+ int rc;
+
+ rc = ptlrpc_server_hpreq_init(svcpt, req);
+ if (rc < 0)
+ return rc;
+
+ ptlrpc_nrs_req_add(svcpt, req, !!rc);
+
+ return 0;
+}
+
+/**
+ * Whether it is allowed to handle a high-priority request.
+ * Can be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_req_lock to get a reliable result
+ */
+static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
+ bool force)
+{
+ int running = svcpt->scp_nthrs_running;
+
+ if (!nrs_svcpt_has_hp(svcpt))
+ return false;
+
+ if (force)
+ return true;
+
+ if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+ CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+ /* leave just 1 thread for normal RPCs */
+ running = PTLRPC_NTHRS_INIT;
+ if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+ running += 1;
+ }
+
+ if (svcpt->scp_nreqs_active >= running - 1)
+ return false;
+
+ if (svcpt->scp_nhreqs_active == 0)
+ return true;
+
+ return !ptlrpc_nrs_req_pending_nolock(svcpt, false) ||
+ svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio;
+}
+
+static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
+ bool force)
+{
+ return ptlrpc_server_allow_high(svcpt, force) &&
+ ptlrpc_nrs_req_pending_nolock(svcpt, true);
+}
+
+/**
+ * Only allow normal priority requests on a service that has a high-priority
+ * queue if forced (i.e. cleanup), if there are other high priority requests
+ * already being processed (i.e. those threads can service more high-priority
+ * requests), or if there are enough idle threads that a later thread can do
+ * a high priority request.
+ * Can be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_req_lock to get a reliable result
+ */
+static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
+ bool force)
+{
+ int running = svcpt->scp_nthrs_running;
+
+ if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+ CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+ /* leave just 1 thread for normal RPCs */
+ running = PTLRPC_NTHRS_INIT;
+ if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+ running += 1;
+ }
+
+ if (force ||
+ svcpt->scp_nreqs_active < running - 2)
+ return true;
+
+ if (svcpt->scp_nreqs_active >= running - 1)
+ return false;
+
+ return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt);
+}
+
+static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt,
+ bool force)
+{
+ return ptlrpc_server_allow_normal(svcpt, force) &&
+ ptlrpc_nrs_req_pending_nolock(svcpt, false);
+}
+
+/**
+ * Returns true if there are requests available in incoming
+ * request queue for processing and it is allowed to fetch them.
+ * Can be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_req_lock to get a reliable result
+ * \see ptlrpc_server_allow_normal
+ * \see ptlrpc_server_allow_high
+ */
+static inline bool
+ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force)
+{
+ return ptlrpc_server_high_pending(svcpt, force) ||
+ ptlrpc_server_normal_pending(svcpt, force);
+}
+
+/**
+ * Fetch a request for processing from queue of unprocessed requests.
+ * Favors high-priority requests.
+ * Returns a pointer to fetched request.
+ */
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
+{
+ struct ptlrpc_request *req = NULL;
+
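+ /* prefer the high-priority queue, but fall back to normal requests;
+ * scp_hreq_count counts consecutive HP requests so that normal
+ * requests are not starved (see srv_hpreq_ratio checked in
+ * ptlrpc_server_allow_high()) */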
+ spin_lock(&svcpt->scp_req_lock);
+
+ if (ptlrpc_server_high_pending(svcpt, force)) {
+ req = ptlrpc_nrs_req_get_nolock(svcpt, true, force);
+ if (req != NULL) {
+ svcpt->scp_hreq_count++;
+ goto got_request;
+ }
+ }
+
+ if (ptlrpc_server_normal_pending(svcpt, force)) {
+ req = ptlrpc_nrs_req_get_nolock(svcpt, false, force);
+ if (req != NULL) {
+ svcpt->scp_hreq_count = 0;
+ goto got_request;
+ }
+ }
+
+ spin_unlock(&svcpt->scp_req_lock);
+ return NULL;
+
+got_request:
+ svcpt->scp_nreqs_active++;
+ if (req->rq_hp)
+ svcpt->scp_nhreqs_active++;
+
+ spin_unlock(&svcpt->scp_req_lock);
+
+ if (likely(req->rq_export))
+ class_export_rpc_inc(req->rq_export);
+
+ return req;
+}
+
+/**
+ * Handle freshly incoming reqs, add to timed early reply list,
+ * pass on to regular request queue.
+ * All incoming requests pass through here before getting into
+ * ptlrpc_server_handle_req later on.
+ */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_thread *thread)
+{
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ struct ptlrpc_request *req;
+ __u32 deadline;
+ int rc;
+
+ spin_lock(&svcpt->scp_lock);
+ if (list_empty(&svcpt->scp_req_incoming)) {
+ spin_unlock(&svcpt->scp_lock);
+ return 0;
+ }
+
+ req = list_entry(svcpt->scp_req_incoming.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init(&req->rq_list);
+ svcpt->scp_nreqs_incoming--;
+ /* Consider this still a "queued" request as far as stats are
+ * concerned */
+ spin_unlock(&svcpt->scp_lock);
+
+ /* go through security check/transform */
+ rc = sptlrpc_svc_unwrap_request(req);
+ switch (rc) {
+ case SECSVC_OK:
+ break;
+ case SECSVC_COMPLETE:
+ target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+ goto err_req;
+ case SECSVC_DROP:
+ goto err_req;
+ default:
+ LBUG();
+ }
+
+ /*
+ * for a null-flavored rpc, the msg has already been unpacked by sptlrpc,
+ * although redoing it wouldn't be harmful.
+ */
+ if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+ rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
+ if (rc != 0) {
+ CERROR("error unpacking request: ptl %d from %s x%llu\n",
+ svc->srv_req_portal, libcfs_id2str(req->rq_peer),
+ req->rq_xid);
+ goto err_req;
+ }
+ }
+
+ rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+ if (rc) {
+ CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n",
+ svc->srv_req_portal, libcfs_id2str(req->rq_peer),
+ req->rq_xid);
+ goto err_req;
+ }
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
+ lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) {
+ CERROR("drop incoming rpc opc %u, x%llu\n",
+ cfs_fail_val, req->rq_xid);
+ goto err_req;
+ }
+
+ rc = -EINVAL;
+ if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+ CERROR("wrong packet type received (type=%u) from %s\n",
+ lustre_msg_get_type(req->rq_reqmsg),
+ libcfs_id2str(req->rq_peer));
+ goto err_req;
+ }
+
+ switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+ case MDS_WRITEPAGE:
+ case OST_WRITE:
+ req->rq_bulk_write = 1;
+ break;
+ case MDS_READPAGE:
+ case OST_READ:
+ case MGS_CONFIG_READ:
+ req->rq_bulk_read = 1;
+ break;
+ }
+
+ CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid);
+
+ req->rq_export = class_conn2export(
+ lustre_msg_get_handle(req->rq_reqmsg));
+ if (req->rq_export) {
+ rc = ptlrpc_check_req(req);
+ if (rc == 0) {
+ rc = sptlrpc_target_export_check(req->rq_export, req);
+ if (rc)
+ DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,");
+ }
+
+ if (rc)
+ goto err_req;
+ ptlrpc_update_export_timer(req->rq_export, 0);
+ }
+
+ /* req_in handling should/must be fast */
+ if (get_seconds() - req->rq_arrival_time.tv_sec > 5)
+ DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
+ cfs_time_sub(get_seconds(),
+ req->rq_arrival_time.tv_sec));
+
+ /* Set rpc server deadline and add it to the timed list */
+ deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+ MSGHDR_AT_SUPPORT) ?
+ /* The max time the client expects us to take */
+ lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+ req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+ if (unlikely(deadline == 0)) {
+ DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+ goto err_req;
+ }
+
+ req->rq_svc_thread = thread;
+
+ ptlrpc_at_add_timed(req);
+
+ /* Move it over to the request processing queue */
+ rc = ptlrpc_server_request_add(svcpt, req);
+ if (rc)
+ goto err_req;
+
+ wake_up(&svcpt->scp_waitq);
+ return 1;
+
+err_req:
+ ptlrpc_server_finish_request(svcpt, req);
+
+ return 1;
+}
+
+/**
+ * Main incoming request handling logic.
+ * Calls handler function from service to do actual processing.
+ */
+static int
+ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_thread *thread)
+{
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ struct ptlrpc_request *request;
+ struct timeval work_start;
+ struct timeval work_end;
+ long timediff;
+ int rc;
+ int fail_opc = 0;
+
+ request = ptlrpc_server_request_get(svcpt, false);
+ if (request == NULL)
+ return 0;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
+ fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
+ else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+ fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
+
+ if (unlikely(fail_opc)) {
+ if (request->rq_export && request->rq_ops)
+ OBD_FAIL_TIMEOUT(fail_opc, 4);
+ }
+
+ ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
+ libcfs_debug_dumplog();
+
+ do_gettimeofday(&work_start);
+ timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,
+ NULL);
+ if (likely(svc->srv_stats != NULL)) {
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+ timediff);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+ svcpt->scp_nreqs_incoming);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+ svcpt->scp_nreqs_active);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+ at_get(&svcpt->scp_at_estimate));
+ }
+
+ rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF);
+ if (rc) {
+ CERROR("Failure to initialize session: %d\n", rc);
+ goto out_req;
+ }
+ request->rq_session.lc_thread = thread;
+ request->rq_session.lc_cookie = 0x5;
+ lu_context_enter(&request->rq_session);
+
+ CDEBUG(D_NET, "got req %llu\n", request->rq_xid);
+
+ request->rq_svc_thread = thread;
+ if (thread)
+ request->rq_svc_thread->t_env->le_ses = &request->rq_session;
+
+ if (likely(request->rq_export)) {
+ if (unlikely(ptlrpc_check_req(request)))
+ goto put_conn;
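+ /* timediff is in microseconds; the >> 19 is a cheap approximate
+ * conversion to seconds (within a factor of two) used as the
+ * extra_delay argument */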
+ ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
+ }
+
+ /* Discard requests queued for longer than the deadline.
+ The deadline is increased if we send an early reply. */
+ if (get_seconds() > request->rq_deadline) {
+ DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline " CFS_DURATION_T ":" CFS_DURATION_T "s ago\n",
+ libcfs_id2str(request->rq_peer),
+ cfs_time_sub(request->rq_deadline,
+ request->rq_arrival_time.tv_sec),
+ cfs_time_sub(get_seconds(),
+ request->rq_deadline));
+ goto put_conn;
+ }
+
+ CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n",
+ current_comm(),
+ (request->rq_export ?
+ (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+ (request->rq_export ?
+ atomic_read(&request->rq_export->exp_refcount) : -99),
+ lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
+ libcfs_id2str(request->rq_peer),
+ lustre_msg_get_opc(request->rq_reqmsg));
+
+ if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+ CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
+
+ rc = svc->srv_ops.so_req_handler(request);
+
+ ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
+
+put_conn:
+ lu_context_exit(&request->rq_session);
+ lu_context_fini(&request->rq_session);
+
+ if (unlikely(get_seconds() > request->rq_deadline)) {
+ DEBUG_REQ(D_WARNING, request,
+ "Request took longer than estimated ("
+ CFS_DURATION_T":"CFS_DURATION_T
+ "s); client may timeout.",
+ cfs_time_sub(request->rq_deadline,
+ request->rq_arrival_time.tv_sec),
+ cfs_time_sub(get_seconds(),
+ request->rq_deadline));
+ }
+
+ do_gettimeofday(&work_end);
+ timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+ CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n",
+ current_comm(),
+ (request->rq_export ?
+ (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+ (request->rq_export ?
+ atomic_read(&request->rq_export->exp_refcount) : -99),
+ lustre_msg_get_status(request->rq_reqmsg),
+ request->rq_xid,
+ libcfs_id2str(request->rq_peer),
+ lustre_msg_get_opc(request->rq_reqmsg),
+ timediff,
+ cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+ (request->rq_repmsg ?
+ lustre_msg_get_transno(request->rq_repmsg) :
+ request->rq_transno),
+ request->rq_status,
+ (request->rq_repmsg ?
+ lustre_msg_get_status(request->rq_repmsg) : -999));
+ if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+ __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+ int opc = opcode_offset(op);
+
+ if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+ LASSERT(opc < LUSTRE_MAX_OPCODES);
+ lprocfs_counter_add(svc->srv_stats,
+ opc + EXTRA_MAX_OPCODES,
+ timediff);
+ }
+ }
+ if (unlikely(request->rq_early_count)) {
+ DEBUG_REQ(D_ADAPTTO, request,
+ "sent %d early replies before finishing in "
+ CFS_DURATION_T"s",
+ request->rq_early_count,
+ cfs_time_sub(work_end.tv_sec,
+ request->rq_arrival_time.tv_sec));
+ }
+
+out_req:
+ ptlrpc_server_finish_active_request(svcpt, request);
+
+ return 1;
+}
+
+/**
+ * An internal function to process a single reply state object.
+ */
+static int
+ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
+{
+ struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ struct obd_export *exp;
+ int nlocks;
+ int been_handled;
+
+ exp = rs->rs_export;
+
+ LASSERT(rs->rs_difficult);
+ LASSERT(rs->rs_scheduled);
+ LASSERT(list_empty(&rs->rs_list));
+
+ spin_lock(&exp->exp_lock);
+ /* Noop if removed already */
+ list_del_init(&rs->rs_exp_list);
+ spin_unlock(&exp->exp_lock);
+
+ /* The disk commit callback holds exp_uncommitted_replies_lock while it
+ * iterates over newly committed replies, removing them from
+ * exp_uncommitted_replies. It then drops this lock and schedules the
+ * replies it found for handling here.
+ *
+ * We can avoid contention for exp_uncommitted_replies_lock between the
+ * HRT threads and further commit callbacks by checking rs_committed
+ * which is set in the commit callback while it holds both
+ * rs_lock and exp_uncommitted_replies.
+ *
+ * If we see rs_committed clear, the commit callback _may_ not have
+ * handled this reply yet and we race with it to grab
+ * exp_uncommitted_replies_lock before removing the reply from
+ * exp_uncommitted_replies. Note that if we lose the race and the
+ * reply has already been removed, list_del_init() is a noop.
+ *
+ * If we see rs_committed set, we know the commit callback is handling,
+ * or has handled this reply since store reordering might allow us to
+ * see rs_committed set out of sequence. But since this is done
+ * holding rs_lock, we can be sure it has all completed once we hold
+ * rs_lock, which we do right next.
+ */
+ if (!rs->rs_committed) {
+ spin_lock(&exp->exp_uncommitted_replies_lock);
+ list_del_init(&rs->rs_obd_list);
+ spin_unlock(&exp->exp_uncommitted_replies_lock);
+ }
+
+ spin_lock(&rs->rs_lock);
+
+ been_handled = rs->rs_handled;
+ rs->rs_handled = 1;
+
+ nlocks = rs->rs_nlocks; /* atomic "steal", but */
+ rs->rs_nlocks = 0; /* locks still on rs_locks! */
+
+ if (nlocks == 0 && !been_handled) {
+ /* If we see this, we should already have seen the warning
+ * in mds_steal_ack_locks() */
+ CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n",
+ rs,
+ rs->rs_xid, rs->rs_transno, rs->rs_opc,
+ libcfs_nid2str(exp->exp_connection->c_peer.nid));
+ }
+
+ if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
+ spin_unlock(&rs->rs_lock);
+
+ if (!been_handled && rs->rs_on_net) {
+ LNetMDUnlink(rs->rs_md_h);
+ /* Ignore return code; we're racing with completion */
+ }
+
+ while (nlocks-- > 0)
+ ldlm_lock_decref(&rs->rs_locks[nlocks],
+ rs->rs_modes[nlocks]);
+
+ spin_lock(&rs->rs_lock);
+ }
+
+ rs->rs_scheduled = 0;
+
+ if (!rs->rs_on_net) {
+ /* Off the net */
+ spin_unlock(&rs->rs_lock);
+
+ class_export_put(exp);
+ rs->rs_export = NULL;
+ ptlrpc_rs_decref(rs);
+ if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
+ svc->srv_is_stopping)
+ wake_up_all(&svcpt->scp_waitq);
+ return 1;
+ }
+
+ /* still on the net; callback will schedule */
+ spin_unlock(&rs->rs_lock);
+ return 1;
+}
+
+
+static void
+ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt)
+{
+ int avail = svcpt->scp_nrqbds_posted;
+ int low_water = test_req_buffer_pressure ? 0 :
+ svcpt->scp_service->srv_nbuf_per_group / 2;
+
+ /* NB I'm not locking; just looking. */
+
+ /* CAVEAT EMPTOR: We might be allocating buffers here because we've
+ * allowed the request history to grow out of control. We could put a
+ * sanity check on that here and cull some history if we need the
+ * space. */
+
+ if (avail <= low_water)
+ ptlrpc_grow_req_bufs(svcpt, 1);
+
+ if (svcpt->scp_service->srv_stats) {
+ lprocfs_counter_add(svcpt->scp_service->srv_stats,
+ PTLRPC_REQBUF_AVAIL_CNTR, avail);
+ }
+}
+
+static int
+ptlrpc_retry_rqbds(void *arg)
+{
+ struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg;
+
+ svcpt->scp_rqbd_timeout = 0;
+ return -ETIMEDOUT;
+}
+
+static inline int
+ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt)
+{
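+ /* keep at least one thread free for incoming requests, plus one
+ * more when the service has a high-priority request handler */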
+ return svcpt->scp_nreqs_active <
+ svcpt->scp_nthrs_running - 1 -
+ (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL);
+}
+
+/**
+ * Whether it is allowed to create more threads.
+ * Can be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_lock to get a reliable result
+ */
+static inline int
+ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt)
+{
+ return svcpt->scp_nthrs_running +
+ svcpt->scp_nthrs_starting <
+ svcpt->scp_service->srv_nthrs_cpt_limit;
+}
+
+/**
+ * too many requests and allowed to create more threads
+ */
+static inline int
+ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
+{
+ return !ptlrpc_threads_enough(svcpt) &&
+ ptlrpc_threads_increasable(svcpt);
+}
+
+static inline int
+ptlrpc_thread_stopping(struct ptlrpc_thread *thread)
+{
+ return thread_is_stopping(thread) ||
+ thread->t_svcpt->scp_service->srv_is_stopping;
+}
+
+static inline int
+ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
+{
+ return !list_empty(&svcpt->scp_rqbd_idle) &&
+ svcpt->scp_rqbd_timeout == 0;
+}
+
+static inline int
+ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
+{
+ return svcpt->scp_at_check;
+}
+
+/**
+ * Requests are waiting for preprocessing.
+ * Can be called without any lock, but the caller needs to hold
+ * ptlrpc_service_part::scp_lock to get a reliable result
+ */
+static inline int
+ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
+{
+ return !list_empty(&svcpt->scp_req_incoming);
+}
+
+static __attribute__((__noinline__)) int
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
+ struct ptlrpc_thread *thread)
+{
+ /* Don't exit while there are replies to be handled */
+ struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
+ ptlrpc_retry_rqbds, svcpt);
+
+ /* XXX: Add this back when libcfs watchdog is merged upstream
+ lc_watchdog_disable(thread->t_watchdog);
+ */
+
+ cond_resched();
+
+ l_wait_event_exclusive_head(svcpt->scp_waitq,
+ ptlrpc_thread_stopping(thread) ||
+ ptlrpc_server_request_incoming(svcpt) ||
+ ptlrpc_server_request_pending(svcpt, false) ||
+ ptlrpc_rqbd_pending(svcpt) ||
+ ptlrpc_at_check(svcpt), &lwi);
+
+ if (ptlrpc_thread_stopping(thread))
+ return -EINTR;
+
+ /*
+ lc_watchdog_touch(thread->t_watchdog,
+ ptlrpc_server_get_timeout(svcpt));
+ */
+ return 0;
+}
+
+/**
+ * Main thread body for service threads.
+ * Waits in a loop for new requests to process to appear.
+ * Every time an incoming request is added to its queue, a waitq
+ * is woken up and one of the threads will handle it.
+ */
+static int ptlrpc_main(void *arg)
+{
+ struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+ struct ptlrpc_service_part *svcpt = thread->t_svcpt;
+ struct ptlrpc_service *svc = svcpt->scp_service;
+ struct ptlrpc_reply_state *rs;
+ struct group_info *ginfo = NULL;
+ struct lu_env *env;
+ int counter = 0, rc = 0;
+
+ thread->t_pid = current_pid();
+ unshare_fs_struct();
+
+ /* NB: we will call cfs_cpt_bind() for all threads, because we
+ * might want to run the lustre server only on a subset of system CPUs,
+ * in which case ->scp_cpt is CFS_CPT_ANY */
+ rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
+ if (rc != 0) {
+ CWARN("%s: failed to bind %s on CPT %d\n",
+ svc->srv_name, thread->t_name, svcpt->scp_cpt);
+ }
+
+ ginfo = groups_alloc(0);
+ if (!ginfo) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ set_current_groups(ginfo);
+ put_group_info(ginfo);
+
+ if (svc->srv_ops.so_thr_init != NULL) {
+ rc = svc->srv_ops.so_thr_init(thread);
+ if (rc)
+ goto out;
+ }
+
+ OBD_ALLOC_PTR(env);
+ if (env == NULL) {
+ rc = -ENOMEM;
+ goto out_srv_fini;
+ }
+
+ rc = lu_context_init(&env->le_ctx,
+ svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
+ if (rc)
+ goto out_srv_fini;
+
+ thread->t_env = env;
+ env->le_ctx.lc_thread = thread;
+ env->le_ctx.lc_cookie = 0x6;
+
+ while (!list_empty(&svcpt->scp_rqbd_idle)) {
+ rc = ptlrpc_server_post_idle_rqbds(svcpt);
+ if (rc >= 0)
+ continue;
+
+ CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
+ svc->srv_name, svcpt->scp_cpt, rc);
+ goto out_srv_fini;
+ }
+
+ /* Alloc reply state structure for this one */
+ OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
+ if (!rs) {
+ rc = -ENOMEM;
+ goto out_srv_fini;
+ }
+
+ spin_lock(&svcpt->scp_lock);
+
+ LASSERT(thread_is_starting(thread));
+ thread_clear_flags(thread, SVC_STARTING);
+
+ LASSERT(svcpt->scp_nthrs_starting == 1);
+ svcpt->scp_nthrs_starting--;
+
+ /* SVC_STOPPING may already be set here if someone else is trying
+ * to stop the service while this new thread has been dynamically
+ * forked. We still set SVC_RUNNING to let our creator know that
+ * we are now running; however, we will exit as soon as possible */
+ thread_add_flags(thread, SVC_RUNNING);
+ svcpt->scp_nthrs_running++;
+ spin_unlock(&svcpt->scp_lock);
+
+ /* wake up our creator in case it's still waiting. */
+ wake_up(&thread->t_ctl_waitq);
+
+ /*
+ thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
+ NULL, NULL);
+ */
+
+ spin_lock(&svcpt->scp_rep_lock);
+ list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+ wake_up(&svcpt->scp_rep_waitq);
+ spin_unlock(&svcpt->scp_rep_lock);
+
+ CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+ svcpt->scp_nthrs_running);
+
+ /* XXX maintain a list of all managed devices: insert here */
+ while (!ptlrpc_thread_stopping(thread)) {
+ if (ptlrpc_wait_event(svcpt, thread))
+ break;
+
+ ptlrpc_check_rqbd_pool(svcpt);
+
+ if (ptlrpc_threads_need_create(svcpt)) {
+ /* Ignore return code - we tried... */
+ ptlrpc_start_thread(svcpt, 0);
+ }
+
+ /* Process all incoming reqs before handling any */
+ if (ptlrpc_server_request_incoming(svcpt)) {
+ lu_context_enter(&env->le_ctx);
+ env->le_ses = NULL;
+ ptlrpc_server_handle_req_in(svcpt, thread);
+ lu_context_exit(&env->le_ctx);
+
+ /* but limit ourselves in case of flood */
+ if (counter++ < 100)
+ continue;
+ counter = 0;
+ }
+
+ if (ptlrpc_at_check(svcpt))
+ ptlrpc_at_check_timed(svcpt);
+
+ if (ptlrpc_server_request_pending(svcpt, false)) {
+ lu_context_enter(&env->le_ctx);
+ ptlrpc_server_handle_request(svcpt, thread);
+ lu_context_exit(&env->le_ctx);
+ }
+
+ if (ptlrpc_rqbd_pending(svcpt) &&
+ ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+ /* I just failed to repost request buffers.
+ * Wait for a timeout (unless something else
+ * happens) before I try again */
+ svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+ CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+ svcpt->scp_nrqbds_posted);
+ }
+ }
+
+ /*
+ lc_watchdog_delete(thread->t_watchdog);
+ thread->t_watchdog = NULL;
+ */
+
+out_srv_fini:
+ /*
+ * deconstruct service specific state created by ptlrpc_start_thread()
+ */
+ if (svc->srv_ops.so_thr_done != NULL)
+ svc->srv_ops.so_thr_done(thread);
+
+ if (env != NULL) {
+ lu_context_fini(&env->le_ctx);
+ OBD_FREE_PTR(env);
+ }
+out:
+ CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
+ thread, thread->t_pid, thread->t_id, rc);
+
+ spin_lock(&svcpt->scp_lock);
+ if (thread_test_and_clear_flags(thread, SVC_STARTING))
+ svcpt->scp_nthrs_starting--;
+
+ if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+ /* must know immediately */
+ svcpt->scp_nthrs_running--;
+ }
+
+ thread->t_id = rc;
+ thread_add_flags(thread, SVC_STOPPED);
+
+ wake_up(&thread->t_ctl_waitq);
+ spin_unlock(&svcpt->scp_lock);
+
+ return rc;
+}
+
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+ struct list_head *replies)
+{
+ int result;
+
+ spin_lock(&hrt->hrt_lock);
+
+ list_splice_init(&hrt->hrt_queue, replies);
+ result = ptlrpc_hr.hr_stopping || !list_empty(replies);
+
+ spin_unlock(&hrt->hrt_lock);
+ return result;
+}
+
+/**
+ * Main body of "handle reply" function.
+ * It processes acked reply states
+ */
+static int ptlrpc_hr_main(void *arg)
+{
+ struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg;
+ struct ptlrpc_hr_partition *hrp = hrt->hrt_partition;
+ LIST_HEAD(replies);
+ char threadname[20];
+ int rc;
+
+ snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
+ hrp->hrp_cpt, hrt->hrt_id);
+ unshare_fs_struct();
+
+ rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
+ if (rc != 0) {
+ CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
+ threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
+ }
+
+ atomic_inc(&hrp->hrp_nstarted);
+ wake_up(&ptlrpc_hr.hr_waitq);
+
+ while (!ptlrpc_hr.hr_stopping) {
+ l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));
+
+ while (!list_empty(&replies)) {
+ struct ptlrpc_reply_state *rs;
+
+ rs = list_entry(replies.prev,
+ struct ptlrpc_reply_state,
+ rs_list);
+ list_del_init(&rs->rs_list);
+ ptlrpc_handle_rs(rs);
+ }
+ }
+
+ atomic_inc(&hrp->hrp_nstopped);
+ wake_up(&ptlrpc_hr.hr_waitq);
+
+ return 0;
+}
+
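+/**
+ * Stop all reply handling threads: signal them to exit, wake them up and
+ * wait until every thread that was started has stopped.
+ */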
+static void ptlrpc_stop_hr_threads(void)
+{
+ struct ptlrpc_hr_partition *hrp;
+ int i;
+ int j;
+
+ ptlrpc_hr.hr_stopping = 1;
+
+ cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+ if (hrp->hrp_thrs == NULL)
+ continue; /* uninitialized */
+ for (j = 0; j < hrp->hrp_nthrs; j++)
+ wake_up_all(&hrp->hrp_thrs[j].hrt_waitq);
+ }
+
+ cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+ if (hrp->hrp_thrs == NULL)
+ continue; /* uninitialized */
+ wait_event(ptlrpc_hr.hr_waitq,
+ atomic_read(&hrp->hrp_nstopped) ==
+ atomic_read(&hrp->hrp_nstarted));
+ }
+}
+
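+/**
+ * Start the reply handling threads for every CPT partition and wait for
+ * them to come up; on failure stop any threads already running.
+ */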
+static int ptlrpc_start_hr_threads(void)
+{
+ struct ptlrpc_hr_partition *hrp;
+ int i;
+ int j;
+
+ cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+ int rc = 0;
+
+ for (j = 0; j < hrp->hrp_nthrs; j++) {
+ struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
+ rc = PTR_ERR(kthread_run(ptlrpc_hr_main,
+ &hrp->hrp_thrs[j],
+ "ptlrpc_hr%02d_%03d",
+ hrp->hrp_cpt,
+ hrt->hrt_id));
+ if (IS_ERR_VALUE(rc))
+ break;
+ }
+ wait_event(ptlrpc_hr.hr_waitq,
+ atomic_read(&hrp->hrp_nstarted) == j);
+ if (!IS_ERR_VALUE(rc))
+ continue;
+
+ CERROR("Reply handling thread %d:%d Failed on starting: rc = %d\n",
+ i, j, rc);
+ ptlrpc_stop_hr_threads();
+ return rc;
+ }
+ return 0;
+}
+
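+/**
+ * Stop all service threads of partition \a svcpt: mark them SVC_STOPPING,
+ * wake them, wait for each to stop and free the stopped thread structures.
+ */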
+static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
+{
+ struct l_wait_info lwi = { 0 };
+ struct ptlrpc_thread *thread;
+ LIST_HEAD(zombie);
+
+ CDEBUG(D_INFO, "Stopping threads for service %s\n",
+ svcpt->scp_service->srv_name);
+
+ spin_lock(&svcpt->scp_lock);
+ /* let the thread know that we would like it to stop asap */
+ list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
+ CDEBUG(D_INFO, "Stopping thread %s #%u\n",
+ svcpt->scp_service->srv_thread_name, thread->t_id);
+ thread_add_flags(thread, SVC_STOPPING);
+ }
+
+ wake_up_all(&svcpt->scp_waitq);
+
+ while (!list_empty(&svcpt->scp_threads)) {
+ thread = list_entry(svcpt->scp_threads.next,
+ struct ptlrpc_thread, t_link);
+ if (thread_is_stopped(thread)) {
+ list_del(&thread->t_link);
+ list_add(&thread->t_link, &zombie);
+ continue;
+ }
+ spin_unlock(&svcpt->scp_lock);
+
+ CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
+ svcpt->scp_service->srv_thread_name, thread->t_id);
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_stopped(thread), &lwi);
+
+ spin_lock(&svcpt->scp_lock);
+ }
+
+ spin_unlock(&svcpt->scp_lock);
+
+ while (!list_empty(&zombie)) {
+ thread = list_entry(zombie.next,
+ struct ptlrpc_thread, t_link);
+ list_del(&thread->t_link);
+ OBD_FREE_PTR(thread);
+ }
+}
+
+/**
+ * Stops all threads of a particular service \a svc
+ */
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (svcpt->scp_service != NULL)
+ ptlrpc_svcpt_stop_threads(svcpt);
+ }
+}
+EXPORT_SYMBOL(ptlrpc_stop_all_threads);
+
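+/**
+ * Start the initial set of service threads (srv_nthrs_cpt_init per CPT)
+ * for service \a svc; all threads are stopped again on failure.
+ */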
+int ptlrpc_start_threads(struct ptlrpc_service *svc)
+{
+ int rc = 0;
+ int i;
+ int j;
+
+ /* We require 2 threads min, see note in ptlrpc_server_handle_request */
+ LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
+
+ for (i = 0; i < svc->srv_ncpts; i++) {
+ for (j = 0; j < svc->srv_nthrs_cpt_init; j++) {
+ rc = ptlrpc_start_thread(svc->srv_parts[i], 1);
+ if (rc == 0)
+ continue;
+
+ if (rc != -EMFILE)
+ goto failed;
+ /* We have enough threads, don't start more. b=15759 */
+ break;
+ }
+ }
+
+ return 0;
+ failed:
+ CERROR("cannot start %s thread #%d_%d: rc %d\n",
+ svc->srv_thread_name, i, j, rc);
+ ptlrpc_stop_all_threads(svc);
+ return rc;
+}
+EXPORT_SYMBOL(ptlrpc_start_threads);
+
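+/**
+ * Start one new service thread on partition \a svcpt. If \a wait is set,
+ * block until the thread is running (or has stopped); returns -EMFILE when
+ * no more threads may be created.
+ */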
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
+{
+ struct l_wait_info lwi = { 0 };
+ struct ptlrpc_thread *thread;
+ struct ptlrpc_service *svc;
+ int rc;
+
+ LASSERT(svcpt != NULL);
+
+ svc = svcpt->scp_service;
+
+ CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
+ svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
+ svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
+
+ again:
+ if (unlikely(svc->srv_is_stopping))
+ return -ESRCH;
+
+ if (!ptlrpc_threads_increasable(svcpt) ||
+ (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
+ svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
+ return -EMFILE;
+
+ OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
+ if (thread == NULL)
+ return -ENOMEM;
+ init_waitqueue_head(&thread->t_ctl_waitq);
+
+ spin_lock(&svcpt->scp_lock);
+ if (!ptlrpc_threads_increasable(svcpt)) {
+ spin_unlock(&svcpt->scp_lock);
+ OBD_FREE_PTR(thread);
+ return -EMFILE;
+ }
+
+ if (svcpt->scp_nthrs_starting != 0) {
+ /* serialize starting because some modules (obdfilter)
+ * might require unique and contiguous t_id */
+ LASSERT(svcpt->scp_nthrs_starting == 1);
+ spin_unlock(&svcpt->scp_lock);
+ OBD_FREE_PTR(thread);
+ if (wait) {
+ CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
+ svc->srv_thread_name, svcpt->scp_thr_nextid);
+ schedule();
+ goto again;
+ }
+
+ CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
+ svc->srv_thread_name, svcpt->scp_thr_nextid);
+ return -EAGAIN;
+ }
+
+ svcpt->scp_nthrs_starting++;
+ thread->t_id = svcpt->scp_thr_nextid++;
+ thread_add_flags(thread, SVC_STARTING);
+ thread->t_svcpt = svcpt;
+
+ list_add(&thread->t_link, &svcpt->scp_threads);
+ spin_unlock(&svcpt->scp_lock);
+
+ if (svcpt->scp_cpt >= 0) {
+ snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d",
+ svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
+ } else {
+ snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d",
+ svc->srv_thread_name, thread->t_id);
+ }
+
+ CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
+ rc = PTR_ERR(kthread_run(ptlrpc_main, thread, "%s", thread->t_name));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("cannot start thread '%s': rc %d\n",
+ thread->t_name, rc);
+ spin_lock(&svcpt->scp_lock);
+ --svcpt->scp_nthrs_starting;
+ if (thread_is_stopping(thread)) {
+ /* this ptlrpc_thread is being handled
+ * by ptlrpc_svcpt_stop_threads now
+ */
+ thread_add_flags(thread, SVC_STOPPED);
+ wake_up(&thread->t_ctl_waitq);
+ spin_unlock(&svcpt->scp_lock);
+ } else {
+ list_del(&thread->t_link);
+ spin_unlock(&svcpt->scp_lock);
+ OBD_FREE_PTR(thread);
+ }
+ return rc;
+ }
+
+ if (!wait)
+ return 0;
+
+ l_wait_event(thread->t_ctl_waitq,
+ thread_is_running(thread) || thread_is_stopped(thread),
+ &lwi);
+
+ rc = thread_is_stopped(thread) ? thread->t_id : 0;
+ return rc;
+}
+
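+/**
+ * Allocate and initialize the per-CPT reply handling partitions, sizing the
+ * number of handler threads per partition from the CPT weight, then start
+ * the reply handling threads.
+ */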
+int ptlrpc_hr_init(void)
+{
+ struct ptlrpc_hr_partition *hrp;
+ struct ptlrpc_hr_thread *hrt;
+ int rc;
+ int i;
+ int j;
+ int weight;
+
+ memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+ ptlrpc_hr.hr_cpt_table = cfs_cpt_table;
+
+ ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+ sizeof(*hrp));
+ if (ptlrpc_hr.hr_partitions == NULL)
+ return -ENOMEM;
+
+ init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+
+ weight = cpumask_weight(topology_thread_cpumask(0));
+
+ cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+ hrp->hrp_cpt = i;
+
+ atomic_set(&hrp->hrp_nstarted, 0);
+ atomic_set(&hrp->hrp_nstopped, 0);
+
+ hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i);
+ hrp->hrp_nthrs /= weight;
+
+ LASSERT(hrp->hrp_nthrs > 0);
+ OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i,
+ hrp->hrp_nthrs * sizeof(*hrt));
+ if (hrp->hrp_thrs == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (j = 0; j < hrp->hrp_nthrs; j++) {
+ hrt = &hrp->hrp_thrs[j];
+
+ hrt->hrt_id = j;
+ hrt->hrt_partition = hrp;
+ init_waitqueue_head(&hrt->hrt_waitq);
+ spin_lock_init(&hrt->hrt_lock);
+ INIT_LIST_HEAD(&hrt->hrt_queue);
+ }
+ }
+
+ rc = ptlrpc_start_hr_threads();
+out:
+ if (rc != 0)
+ ptlrpc_hr_fini();
+ return rc;
+}
+
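+/**
+ * Stop the reply handling threads and release the per-CPT partition state.
+ */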
+void ptlrpc_hr_fini(void)
+{
+ struct ptlrpc_hr_partition *hrp;
+ int i;
+
+ if (ptlrpc_hr.hr_partitions == NULL)
+ return;
+
+ ptlrpc_stop_hr_threads();
+
+ cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+ if (hrp->hrp_thrs != NULL) {
+ OBD_FREE(hrp->hrp_thrs,
+ hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0]));
+ }
+ }
+
+ cfs_percpt_free(ptlrpc_hr.hr_partitions);
+ ptlrpc_hr.hr_partitions = NULL;
+}
+
+
+/**
+ * Wait until all already scheduled replies are processed.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+ while (1) {
+ int rc;
+ struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
+ NULL, NULL);
+
+ rc = l_wait_event(svcpt->scp_waitq,
+ atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi);
+ if (rc == 0)
+ break;
+ CWARN("Unexpectedly long timeout %s %p\n",
+ svcpt->scp_service->srv_name, svcpt->scp_service);
+ }
+}
+
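+/**
+ * Disarm the adaptive timeout (AT) timer on every partition of \a svc.
+ */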
+static void
+ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ int i;
+
+ /* disarm the AT timer early... */
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (svcpt->scp_service != NULL)
+ cfs_timer_disarm(&svcpt->scp_at_timer);
+ }
+}
+
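+/**
+ * Unlink all posted request buffers of \a svc from LNet and wait until the
+ * network has released every buffer it is still filling.
+ */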
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ struct ptlrpc_request_buffer_desc *rqbd;
+ struct l_wait_info lwi;
+ int rc;
+ int i;
+
+ /* All history will be culled when the next request buffer is
+ * freed in ptlrpc_service_purge_all() */
+ svc->srv_hist_nrqbds_cpt_max = 0;
+
+ rc = LNetClearLazyPortal(svc->srv_req_portal);
+ LASSERT(rc == 0);
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (svcpt->scp_service == NULL)
+ break;
+
+ /* Unlink all the request buffers. This forces a 'final'
+ * event with its 'unlink' flag set for each posted rqbd */
+ list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
+ rqbd_list) {
+ rc = LNetMDUnlink(rqbd->rqbd_md_h);
+ LASSERT(rc == 0 || rc == -ENOENT);
+ }
+ }
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (svcpt->scp_service == NULL)
+ break;
+
+ /* Wait for the network to release any buffers
+ * it's currently filling */
+ spin_lock(&svcpt->scp_lock);
+ while (svcpt->scp_nrqbds_posted != 0) {
+ spin_unlock(&svcpt->scp_lock);
+ /* Network access will complete in finite time but
+ * the HUGE timeout lets us CWARN for visibility
+ * of sluggish NALs */
+ lwi = LWI_TIMEOUT_INTERVAL(
+ cfs_time_seconds(LONG_UNLINK),
+ cfs_time_seconds(1), NULL, NULL);
+ rc = l_wait_event(svcpt->scp_waitq,
+ svcpt->scp_nrqbds_posted == 0, &lwi);
+ if (rc == -ETIMEDOUT) {
+ CWARN("Service %s waiting for request buffers\n",
+ svcpt->scp_service->srv_name);
+ }
+ spin_lock(&svcpt->scp_lock);
+ }
+ spin_unlock(&svcpt->scp_lock);
+ }
+}
+
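+/**
+ * Drain and free everything still queued on \a svc: active reply states,
+ * incoming and pending requests, idle request buffers and idle reply states.
+ */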
+static void
+ptlrpc_service_purge_all(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ struct ptlrpc_request_buffer_desc *rqbd;
+ struct ptlrpc_request *req;
+ struct ptlrpc_reply_state *rs;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (svcpt->scp_service == NULL)
+ break;
+
+ spin_lock(&svcpt->scp_rep_lock);
+ while (!list_empty(&svcpt->scp_rep_active)) {
+ rs = list_entry(svcpt->scp_rep_active.next,
+ struct ptlrpc_reply_state, rs_list);
+ spin_lock(&rs->rs_lock);
+ ptlrpc_schedule_difficult_reply(rs);
+ spin_unlock(&rs->rs_lock);
+ }
+ spin_unlock(&svcpt->scp_rep_lock);
+
+ /* purge the request queue. NB No new replies (rqbds
+ * all unlinked) and no service threads, so I'm the only
+ * thread noodling the request queue now */
+ while (!list_empty(&svcpt->scp_req_incoming)) {
+ req = list_entry(svcpt->scp_req_incoming.next,
+ struct ptlrpc_request, rq_list);
+
+ list_del(&req->rq_list);
+ svcpt->scp_nreqs_incoming--;
+ ptlrpc_server_finish_request(svcpt, req);
+ }
+
+ while (ptlrpc_server_request_pending(svcpt, true)) {
+ req = ptlrpc_server_request_get(svcpt, true);
+ ptlrpc_server_finish_active_request(svcpt, req);
+ }
+
+ LASSERT(list_empty(&svcpt->scp_rqbd_posted));
+ LASSERT(svcpt->scp_nreqs_incoming == 0);
+ LASSERT(svcpt->scp_nreqs_active == 0);
+ /* history should have been culled by
+ * ptlrpc_server_finish_request */
+ LASSERT(svcpt->scp_hist_nrqbds == 0);
+
+ /* Now free all the request buffers since nothing
+ * references them any more... */
+
+ while (!list_empty(&svcpt->scp_rqbd_idle)) {
+ rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+ struct ptlrpc_request_buffer_desc,
+ rqbd_list);
+ ptlrpc_free_rqbd(rqbd);
+ }
+ ptlrpc_wait_replies(svcpt);
+
+ while (!list_empty(&svcpt->scp_rep_idle)) {
+ rs = list_entry(svcpt->scp_rep_idle.next,
+ struct ptlrpc_reply_state,
+ rs_list);
+ list_del(&rs->rs_list);
+ OBD_FREE_LARGE(rs, svc->srv_max_reply_size);
+ }
+ }
+}
+
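+/**
+ * Free the per-partition AT arrays, the partition structures and the
+ * service structure itself.
+ */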
+static void
+ptlrpc_service_free(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ struct ptlrpc_at_array *array;
+ int i;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ if (svcpt->scp_service == NULL)
+ break;
+
+ /* In case somebody rearmed this in the meantime */
+ cfs_timer_disarm(&svcpt->scp_at_timer);
+ array = &svcpt->scp_at_array;
+
+ if (array->paa_reqs_array != NULL) {
+ OBD_FREE(array->paa_reqs_array,
+ sizeof(struct list_head) * array->paa_size);
+ array->paa_reqs_array = NULL;
+ }
+
+ if (array->paa_reqs_count != NULL) {
+ OBD_FREE(array->paa_reqs_count,
+ sizeof(__u32) * array->paa_size);
+ array->paa_reqs_count = NULL;
+ }
+ }
+
+ ptlrpc_service_for_each_part(svcpt, i, svc)
+ OBD_FREE_PTR(svcpt);
+
+ if (svc->srv_cpts != NULL)
+ cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts);
+
+ OBD_FREE(svc, offsetof(struct ptlrpc_service,
+ srv_parts[svc->srv_ncpts]));
+}
+
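+/**
+ * Tear down service \a service: stop its threads, unlink and purge all
+ * request buffers and requests, then free all associated state.
+ */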
+int ptlrpc_unregister_service(struct ptlrpc_service *service)
+{
+ CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
+
+ service->srv_is_stopping = 1;
+
+ mutex_lock(&ptlrpc_all_services_mutex);
+ list_del_init(&service->srv_list);
+ mutex_unlock(&ptlrpc_all_services_mutex);
+
+ ptlrpc_service_del_atimer(service);
+ ptlrpc_stop_all_threads(service);
+
+ ptlrpc_service_unlink_rqbd(service);
+ ptlrpc_service_purge_all(service);
+ ptlrpc_service_nrs_cleanup(service);
+
+ ptlrpc_lprocfs_unregister_service(service);
+
+ ptlrpc_service_free(service);
+
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_unregister_service);
+
+/**
+ * Returns 0 if the service is healthy.
+ *
+ * Right now, it just checks to make sure that requests aren't languishing
+ * in the queue. We'll use this health check to govern whether a node needs
+ * to be shot, so it's intentionally non-aggressive.
+ */
+int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
+{
+ struct ptlrpc_request *request = NULL;
+ struct timeval right_now;
+ long timediff;
+
+ do_gettimeofday(&right_now);
+
+ spin_lock(&svcpt->scp_req_lock);
+ /* How long has the next entry been waiting? */
+ if (ptlrpc_server_high_pending(svcpt, true))
+ request = ptlrpc_nrs_req_peek_nolock(svcpt, true);
+ else if (ptlrpc_server_normal_pending(svcpt, true))
+ request = ptlrpc_nrs_req_peek_nolock(svcpt, false);
+
+ if (request == NULL) {
+ spin_unlock(&svcpt->scp_req_lock);
+ return 0;
+ }
+
+ timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
+ spin_unlock(&svcpt->scp_req_lock);
+
+ if ((timediff / ONE_MILLION) >
+ (AT_OFF ? obd_timeout * 3 / 2 : at_max)) {
+ CERROR("%s: unhealthy - request has been waiting %lds\n",
+ svcpt->scp_service->srv_name, timediff / ONE_MILLION);
+ return -1;
+ }
+
+ return 0;
+}
+
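+/**
+ * Returns 0 if every partition of \a svc passes the per-partition health
+ * check, otherwise the first non-zero result.
+ */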
+int
+ptlrpc_service_health_check(struct ptlrpc_service *svc)
+{
+ struct ptlrpc_service_part *svcpt;
+ int i;
+
+ if (svc == NULL)
+ return 0;
+
+ ptlrpc_service_for_each_part(svcpt, i, svc) {
+ int rc = ptlrpc_svcpt_health_check(svcpt);
+
+ if (rc != 0)
+ return rc;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ptlrpc_service_health_check);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
new file mode 100644
index 000000000..d6d92046c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
@@ -0,0 +1,4492 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+
+#include "../include/obd_support.h"
+#include "../include/obd_class.h"
+#include "../include/lustre_net.h"
+#include "../include/lustre_disk.h"
+void lustre_assert_wire_constants(void)
+{
+ /* Wire protocol assertions generated by 'wirecheck'
+ * (make -C lustre/utils newwiretest)
+ * running on Linux centos6-bis 2.6.32-358.0.1.el6-head
+ * #3 SMP Wed Apr 17 17:37:43 CEST 2013
+ * with gcc version 4.4.6 20110731 (Red Hat 4.4.6-3) (GCC)
+ */
+
+ /* Constants... */
+ LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
+ (long long)PTL_RPC_MSG_REQUEST);
+ LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n",
+ (long long)PTL_RPC_MSG_ERR);
+ LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n",
+ (long long)PTL_RPC_MSG_REPLY);
+ LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n",
+ MDS_DIR_END_OFF);
+ LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n",
+ DEAD_HANDLE_MAGIC);
+ CLASSERT(MTI_NAME_MAXLEN == 64);
+ LASSERTF(OST_REPLY == 0, "found %lld\n",
+ (long long)OST_REPLY);
+ LASSERTF(OST_GETATTR == 1, "found %lld\n",
+ (long long)OST_GETATTR);
+ LASSERTF(OST_SETATTR == 2, "found %lld\n",
+ (long long)OST_SETATTR);
+ LASSERTF(OST_READ == 3, "found %lld\n",
+ (long long)OST_READ);
+ LASSERTF(OST_WRITE == 4, "found %lld\n",
+ (long long)OST_WRITE);
+ LASSERTF(OST_CREATE == 5, "found %lld\n",
+ (long long)OST_CREATE);
+ LASSERTF(OST_DESTROY == 6, "found %lld\n",
+ (long long)OST_DESTROY);
+ LASSERTF(OST_GET_INFO == 7, "found %lld\n",
+ (long long)OST_GET_INFO);
+ LASSERTF(OST_CONNECT == 8, "found %lld\n",
+ (long long)OST_CONNECT);
+ LASSERTF(OST_DISCONNECT == 9, "found %lld\n",
+ (long long)OST_DISCONNECT);
+ LASSERTF(OST_PUNCH == 10, "found %lld\n",
+ (long long)OST_PUNCH);
+ LASSERTF(OST_OPEN == 11, "found %lld\n",
+ (long long)OST_OPEN);
+ LASSERTF(OST_CLOSE == 12, "found %lld\n",
+ (long long)OST_CLOSE);
+ LASSERTF(OST_STATFS == 13, "found %lld\n",
+ (long long)OST_STATFS);
+ LASSERTF(OST_SYNC == 16, "found %lld\n",
+ (long long)OST_SYNC);
+ LASSERTF(OST_SET_INFO == 17, "found %lld\n",
+ (long long)OST_SET_INFO);
+ LASSERTF(OST_QUOTACHECK == 18, "found %lld\n",
+ (long long)OST_QUOTACHECK);
+ LASSERTF(OST_QUOTACTL == 19, "found %lld\n",
+ (long long)OST_QUOTACTL);
+ LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n",
+ (long long)OST_QUOTA_ADJUST_QUNIT);
+ LASSERTF(OST_LAST_OPC == 21, "found %lld\n",
+ (long long)OST_LAST_OPC);
+ LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+ OBD_OBJECT_EOF);
+ LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n",
+ (long long)OST_MIN_PRECREATE);
+ LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n",
+ (long long)OST_MAX_PRECREATE);
+ LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n",
+ OST_LVB_ERR_INIT);
+ LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n",
+ OST_LVB_ERR_MASK);
+ LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n",
+ (long long)MDS_FIRST_OPC);
+ LASSERTF(MDS_GETATTR == 33, "found %lld\n",
+ (long long)MDS_GETATTR);
+ LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n",
+ (long long)MDS_GETATTR_NAME);
+ LASSERTF(MDS_CLOSE == 35, "found %lld\n",
+ (long long)MDS_CLOSE);
+ LASSERTF(MDS_REINT == 36, "found %lld\n",
+ (long long)MDS_REINT);
+ LASSERTF(MDS_READPAGE == 37, "found %lld\n",
+ (long long)MDS_READPAGE);
+ LASSERTF(MDS_CONNECT == 38, "found %lld\n",
+ (long long)MDS_CONNECT);
+ LASSERTF(MDS_DISCONNECT == 39, "found %lld\n",
+ (long long)MDS_DISCONNECT);
+ LASSERTF(MDS_GETSTATUS == 40, "found %lld\n",
+ (long long)MDS_GETSTATUS);
+ LASSERTF(MDS_STATFS == 41, "found %lld\n",
+ (long long)MDS_STATFS);
+ LASSERTF(MDS_PIN == 42, "found %lld\n",
+ (long long)MDS_PIN);
+ LASSERTF(MDS_UNPIN == 43, "found %lld\n",
+ (long long)MDS_UNPIN);
+ LASSERTF(MDS_SYNC == 44, "found %lld\n",
+ (long long)MDS_SYNC);
+ LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n",
+ (long long)MDS_DONE_WRITING);
+ LASSERTF(MDS_SET_INFO == 46, "found %lld\n",
+ (long long)MDS_SET_INFO);
+ LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n",
+ (long long)MDS_QUOTACHECK);
+ LASSERTF(MDS_QUOTACTL == 48, "found %lld\n",
+ (long long)MDS_QUOTACTL);
+ LASSERTF(MDS_GETXATTR == 49, "found %lld\n",
+ (long long)MDS_GETXATTR);
+ LASSERTF(MDS_SETXATTR == 50, "found %lld\n",
+ (long long)MDS_SETXATTR);
+ LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n",
+ (long long)MDS_WRITEPAGE);
+ LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n",
+ (long long)MDS_IS_SUBDIR);
+ LASSERTF(MDS_GET_INFO == 53, "found %lld\n",
+ (long long)MDS_GET_INFO);
+ LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n",
+ (long long)MDS_HSM_STATE_GET);
+ LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n",
+ (long long)MDS_HSM_STATE_SET);
+ LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n",
+ (long long)MDS_HSM_ACTION);
+ LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n",
+ (long long)MDS_HSM_PROGRESS);
+ LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n",
+ (long long)MDS_HSM_REQUEST);
+ LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n",
+ (long long)MDS_HSM_CT_REGISTER);
+ LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n",
+ (long long)MDS_HSM_CT_UNREGISTER);
+ LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n",
+ (long long)MDS_SWAP_LAYOUTS);
+ LASSERTF(MDS_LAST_OPC == 62, "found %lld\n",
+ (long long)MDS_LAST_OPC);
+ LASSERTF(REINT_SETATTR == 1, "found %lld\n",
+ (long long)REINT_SETATTR);
+ LASSERTF(REINT_CREATE == 2, "found %lld\n",
+ (long long)REINT_CREATE);
+ LASSERTF(REINT_LINK == 3, "found %lld\n",
+ (long long)REINT_LINK);
+ LASSERTF(REINT_UNLINK == 4, "found %lld\n",
+ (long long)REINT_UNLINK);
+ LASSERTF(REINT_RENAME == 5, "found %lld\n",
+ (long long)REINT_RENAME);
+ LASSERTF(REINT_OPEN == 6, "found %lld\n",
+ (long long)REINT_OPEN);
+ LASSERTF(REINT_SETXATTR == 7, "found %lld\n",
+ (long long)REINT_SETXATTR);
+ LASSERTF(REINT_RMENTRY == 8, "found %lld\n",
+ (long long)REINT_RMENTRY);
+ LASSERTF(REINT_MAX == 9, "found %lld\n",
+ (long long)REINT_MAX);
+ LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_IT_EXECD);
+ LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_LOOKUP_EXECD);
+ LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_LOOKUP_NEG);
+ LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_LOOKUP_POS);
+ LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_OPEN_CREATE);
+ LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_OPEN_OPEN);
+ LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_ENQ_COMPLETE);
+ LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_ENQ_OPEN_REF);
+ LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_ENQ_CREATE_REF);
+ LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n",
+ (unsigned)DISP_OPEN_LOCK);
+ LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n",
+ (long long)MDS_STATUS_CONN);
+ LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n",
+ (long long)MDS_STATUS_LOV);
+ LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n",
+ (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES);
+ LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)MF_SOM_CHANGE);
+ LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)MF_EPOCH_OPEN);
+ LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)MF_EPOCH_CLOSE);
+ LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n",
+ (unsigned)MF_MDC_CANCEL_FID1);
+ LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n",
+ (unsigned)MF_MDC_CANCEL_FID2);
+ LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n",
+ (unsigned)MF_MDC_CANCEL_FID3);
+ LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n",
+ (unsigned)MF_MDC_CANCEL_FID4);
+ LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n",
+ (unsigned)MF_SOM_AU);
+ LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n",
+ (unsigned)MF_GETATTR_LOCK);
+ LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_MODE);
+ LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_UID);
+ LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_GID);
+ LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_SIZE);
+ LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_ATIME);
+ LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_MTIME);
+ LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_CTIME);
+ LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_ATIME_SET);
+ LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_MTIME_SET);
+ LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_FORCE);
+ LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_ATTR_FLAG);
+ LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_KILL_SUID);
+ LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_KILL_SGID);
+ LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_CTIME_SET);
+ LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_FROM_OPEN);
+ LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n",
+ (long long)MDS_ATTR_BLOCKS);
+ LASSERTF(FLD_QUERY == 900, "found %lld\n",
+ (long long)FLD_QUERY);
+ LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n",
+ (long long)FLD_FIRST_OPC);
+ LASSERTF(FLD_LAST_OPC == 901, "found %lld\n",
+ (long long)FLD_LAST_OPC);
+ LASSERTF(SEQ_QUERY == 700, "found %lld\n",
+ (long long)SEQ_QUERY);
+ LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n",
+ (long long)SEQ_FIRST_OPC);
+ LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n",
+ (long long)SEQ_LAST_OPC);
+ LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n",
+ (long long)SEQ_ALLOC_SUPER);
+ LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n",
+ (long long)SEQ_ALLOC_META);
+ LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n",
+ (long long)LDLM_ENQUEUE);
+ LASSERTF(LDLM_CONVERT == 102, "found %lld\n",
+ (long long)LDLM_CONVERT);
+ LASSERTF(LDLM_CANCEL == 103, "found %lld\n",
+ (long long)LDLM_CANCEL);
+ LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n",
+ (long long)LDLM_BL_CALLBACK);
+ LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n",
+ (long long)LDLM_CP_CALLBACK);
+ LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n",
+ (long long)LDLM_GL_CALLBACK);
+ LASSERTF(LDLM_SET_INFO == 107, "found %lld\n",
+ (long long)LDLM_SET_INFO);
+ LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n",
+ (long long)LDLM_LAST_OPC);
+ LASSERTF(LCK_MINMODE == 0, "found %lld\n",
+ (long long)LCK_MINMODE);
+ LASSERTF(LCK_EX == 1, "found %lld\n",
+ (long long)LCK_EX);
+ LASSERTF(LCK_PW == 2, "found %lld\n",
+ (long long)LCK_PW);
+ LASSERTF(LCK_PR == 4, "found %lld\n",
+ (long long)LCK_PR);
+ LASSERTF(LCK_CW == 8, "found %lld\n",
+ (long long)LCK_CW);
+ LASSERTF(LCK_CR == 16, "found %lld\n",
+ (long long)LCK_CR);
+ LASSERTF(LCK_NL == 32, "found %lld\n",
+ (long long)LCK_NL);
+ LASSERTF(LCK_GROUP == 64, "found %lld\n",
+ (long long)LCK_GROUP);
+ LASSERTF(LCK_COS == 128, "found %lld\n",
+ (long long)LCK_COS);
+ LASSERTF(LCK_MAXMODE == 129, "found %lld\n",
+ (long long)LCK_MAXMODE);
+ LASSERTF(LCK_MODE_NUM == 8, "found %lld\n",
+ (long long)LCK_MODE_NUM);
+ CLASSERT(LDLM_PLAIN == 10);
+ CLASSERT(LDLM_EXTENT == 11);
+ CLASSERT(LDLM_FLOCK == 12);
+ CLASSERT(LDLM_IBITS == 13);
+ CLASSERT(LDLM_MAX_TYPE == 14);
+ CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0);
+ CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1);
+ LASSERTF(UPDATE_OBJ == 1000, "found %lld\n",
+ (long long)UPDATE_OBJ);
+ LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n",
+ (long long)UPDATE_LAST_OPC);
+ CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2);
+ CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3);
+ CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3);
+ CLASSERT(LQUOTA_TYPE_USR == 0);
+ CLASSERT(LQUOTA_TYPE_GRP == 1);
+ CLASSERT(LQUOTA_RES_MD == 1);
+ CLASSERT(LQUOTA_RES_DT == 2);
+ LASSERTF(OBD_PING == 400, "found %lld\n",
+ (long long)OBD_PING);
+ LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n",
+ (long long)OBD_LOG_CANCEL);
+ LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n",
+ (long long)OBD_QC_CALLBACK);
+ LASSERTF(OBD_IDX_READ == 403, "found %lld\n",
+ (long long)OBD_IDX_READ);
+ LASSERTF(OBD_LAST_OPC == 404, "found %lld\n",
+ (long long)OBD_LAST_OPC);
+ LASSERTF(QUOTA_DQACQ == 601, "found %lld\n",
+ (long long)QUOTA_DQACQ);
+ LASSERTF(QUOTA_DQREL == 602, "found %lld\n",
+ (long long)QUOTA_DQREL);
+ LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n",
+ (long long)QUOTA_LAST_OPC);
+ LASSERTF(MGS_CONNECT == 250, "found %lld\n",
+ (long long)MGS_CONNECT);
+ LASSERTF(MGS_DISCONNECT == 251, "found %lld\n",
+ (long long)MGS_DISCONNECT);
+ LASSERTF(MGS_EXCEPTION == 252, "found %lld\n",
+ (long long)MGS_EXCEPTION);
+ LASSERTF(MGS_TARGET_REG == 253, "found %lld\n",
+ (long long)MGS_TARGET_REG);
+ LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n",
+ (long long)MGS_TARGET_DEL);
+ LASSERTF(MGS_SET_INFO == 255, "found %lld\n",
+ (long long)MGS_SET_INFO);
+ LASSERTF(MGS_LAST_OPC == 257, "found %lld\n",
+ (long long)MGS_LAST_OPC);
+ LASSERTF(SEC_CTX_INIT == 801, "found %lld\n",
+ (long long)SEC_CTX_INIT);
+ LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n",
+ (long long)SEC_CTX_INIT_CONT);
+ LASSERTF(SEC_CTX_FINI == 803, "found %lld\n",
+ (long long)SEC_CTX_FINI);
+ LASSERTF(SEC_LAST_OPC == 804, "found %lld\n",
+ (long long)SEC_LAST_OPC);
+ /* Sizes and Offsets */
+
+ /* Checks for struct obd_uuid */
+ LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct obd_uuid));
+
+ /* Checks for struct lu_seq_range */
+ LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct lu_seq_range));
+ LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lu_seq_range, lsr_start));
+ LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start));
+ LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lu_seq_range, lsr_end));
+ LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end));
+ LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lu_seq_range, lsr_index));
+ LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index));
+ LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct lu_seq_range, lsr_flags));
+ LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags));
+ LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n",
+ (long long)LU_SEQ_RANGE_MDT);
+ LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n",
+ (long long)LU_SEQ_RANGE_OST);
+
+ /* Checks for struct lustre_mdt_attrs */
+ LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct lustre_mdt_attrs));
+ LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat));
+ LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat));
+ LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat));
+ LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat));
+ LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid));
+ LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid));
+ LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)LMAI_RELEASED);
+ LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)LMAC_HSM);
+ LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)LMAC_SOM);
+ LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)LMAC_NOT_IN_OI);
+ LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n",
+ (unsigned)LMAC_FID_ON_OST);
+ LASSERTF(OBJ_CREATE == 1, "found %lld\n",
+ (long long)OBJ_CREATE);
+ LASSERTF(OBJ_DESTROY == 2, "found %lld\n",
+ (long long)OBJ_DESTROY);
+ LASSERTF(OBJ_REF_ADD == 3, "found %lld\n",
+ (long long)OBJ_REF_ADD);
+ LASSERTF(OBJ_REF_DEL == 4, "found %lld\n",
+ (long long)OBJ_REF_DEL);
+ LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n",
+ (long long)OBJ_ATTR_SET);
+ LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n",
+ (long long)OBJ_ATTR_GET);
+ LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n",
+ (long long)OBJ_XATTR_SET);
+ LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n",
+ (long long)OBJ_XATTR_GET);
+ LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+ (long long)OBJ_INDEX_LOOKUP);
+ LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n",
+ (long long)OBJ_INDEX_INSERT);
+ LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n",
+ (long long)OBJ_INDEX_DELETE);
+
+ /* Checks for struct ost_id */
+ LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct ost_id));
+ LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ost_id, oi));
+ LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_id *)0)->oi));
+ LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n",
+ (long long)LUSTRE_FID_INIT_OID);
+ LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n",
+ (long long)FID_SEQ_OST_MDT0);
+ LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n",
+ (long long)FID_SEQ_LLOG);
+ LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n",
+ (long long)FID_SEQ_ECHO);
+ LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n",
+ (long long)FID_SEQ_OST_MDT1);
+ LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n",
+ (long long)FID_SEQ_OST_MAX);
+ LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n",
+ (long long)FID_SEQ_RSVD);
+ LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n",
+ (long long)FID_SEQ_IGIF);
+ LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_IGIF_MAX);
+ LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_IDIF);
+ LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_IDIF_MAX);
+ LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_START);
+ LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_LOCAL_FILE);
+ LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_DOT_LUSTRE);
+ LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_SPECIAL);
+ LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_QUOTA);
+ LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_QUOTA_GLB);
+ LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_ROOT);
+ LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_NORMAL);
+ LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+ (long long)FID_SEQ_LOV_DEFAULT);
+ LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)FID_OID_SPECIAL_BFL);
+ LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)FID_OID_DOT_LUSTRE);
+ LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)FID_OID_DOT_LUSTRE_OBF);
+
+ /* Checks for struct lu_dirent */
+ LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct lu_dirent));
+ LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirent, lde_fid));
+ LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid));
+ LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirent, lde_hash));
+ LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash));
+ LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirent, lde_reclen));
+ LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen));
+ LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirent, lde_namelen));
+ LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen));
+ LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirent, lde_attrs));
+ LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs));
+ LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirent, lde_name[0]));
+ LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0]));
+ LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)LUDA_FID);
+ LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)LUDA_TYPE);
+ LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)LUDA_64BITHASH);
+
+ /* Checks for struct luda_type */
+ LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n",
+ (long long)(int)sizeof(struct luda_type));
+ LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct luda_type, lt_type));
+ LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct luda_type *)0)->lt_type));
+
+ /* Checks for struct lu_dirpage */
+ LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct lu_dirpage));
+ LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start));
+ LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start));
+ LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end));
+ LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end));
+ LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirpage, ldp_flags));
+ LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags));
+ LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirpage, ldp_pad0));
+ LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0));
+ LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0]));
+ LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]));
+ LASSERTF(LDF_EMPTY == 1, "found %lld\n",
+ (long long)LDF_EMPTY);
+ LASSERTF(LDF_COLLIDE == 2, "found %lld\n",
+ (long long)LDF_COLLIDE);
+ LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n",
+ (long long)LU_PAGE_SIZE);
+ /* Checks for union lu_page */
+ LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n",
+ (long long)(int)sizeof(union lu_page));
+
+ /* Checks for struct lustre_handle */
+ LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct lustre_handle));
+ LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_handle, cookie));
+ LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_handle *)0)->cookie));
+
+ /* Checks for struct lustre_msg_v2 */
+ LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct lustre_msg_v2));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_magic));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3));
+ LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0]));
+ LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]));
+ LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n",
+ LUSTRE_MSG_MAGIC_V1);
+ LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n",
+ LUSTRE_MSG_MAGIC_V2);
+ LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n",
+ LUSTRE_MSG_MAGIC_V1_SWABBED);
+ LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n",
+ LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+ /* Checks for struct ptlrpc_body */
+ LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n",
+ (long long)(int)sizeof(struct ptlrpc_body_v3));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv));
+ CLASSERT(PTLRPC_NUM_VERSIONS == 4);
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding));
+ CLASSERT(JOBSTATS_JOBID_SIZE == 32);
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n",
+ (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions));
+ LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n",
+ (int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding));
+ LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n",
+ (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding));
+ LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n",
+ (long long)MSG_PTLRPC_BODY_OFF);
+ LASSERTF(REQ_REC_OFF == 1, "found %lld\n",
+ (long long)REQ_REC_OFF);
+ LASSERTF(REPLY_REC_OFF == 1, "found %lld\n",
+ (long long)REPLY_REC_OFF);
+ LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n",
+ (long long)DLM_LOCKREQ_OFF);
+ LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n",
+ (long long)DLM_REQ_REC_OFF);
+ LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n",
+ (long long)DLM_INTENT_IT_OFF);
+ LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n",
+ (long long)DLM_INTENT_REC_OFF);
+ LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n",
+ (long long)DLM_LOCKREPLY_OFF);
+ LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n",
+ (long long)DLM_REPLY_REC_OFF);
+ LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n",
+ (long long)MSG_PTLRPC_HEADER_OFF);
+ LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n",
+ PTLRPC_MSG_VERSION);
+ LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n",
+ LUSTRE_VERSION_MASK);
+ LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n",
+ LUSTRE_OBD_VERSION);
+ LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n",
+ LUSTRE_MDS_VERSION);
+ LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n",
+ LUSTRE_OST_VERSION);
+ LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n",
+ LUSTRE_DLM_VERSION);
+ LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n",
+ LUSTRE_LOG_VERSION);
+ LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n",
+ LUSTRE_MGS_VERSION);
+ LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n",
+ (long long)MSGHDR_AT_SUPPORT);
+ LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n",
+ (long long)MSGHDR_CKSUM_INCOMPAT18);
+ LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_OP_FLAG_MASK);
+ LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n",
+ (long long)MSG_OP_FLAG_SHIFT);
+ LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n",
+ (unsigned)MSG_GEN_FLAG_MASK);
+ LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_LAST_REPLAY);
+ LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_RESENT);
+ LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_REPLAY);
+ LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_DELAY_REPLAY);
+ LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_VERSION_REPLAY);
+ LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_REQ_REPLAY_DONE);
+ LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_LOCK_REPLAY_DONE);
+ LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_RECOVERING);
+ LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_RECONNECT);
+ LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_REPLAYABLE);
+ LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_LIBCLIENT);
+ LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_INITIAL);
+ LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_ASYNC);
+ LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_NEXT_VER);
+ LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n",
+ (unsigned)MSG_CONNECT_TRANSNO);
+
+ /* Checks for struct obd_connect_data */
+ LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n",
+ (long long)(int)sizeof(struct obd_connect_data));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_version));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_grant));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_index));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_unused));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_transno));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_group));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_instance));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance));
+ LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding1));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding2));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding3));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding4));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding5));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding6));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding7));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding8));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8));
+ LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, padding9));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9));
+ LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, paddingA));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA));
+ LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, paddingB));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB));
+ LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, paddingC));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC));
+ LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, paddingD));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD));
+ LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, paddingE));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE));
+ LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n",
+ (long long)(int)offsetof(struct obd_connect_data, paddingF));
+ LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF));
+ LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_RDONLY);
+ LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_INDEX);
+ LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_MDS);
+ LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_GRANT);
+ LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_SRVLOCK);
+ LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_VERSION);
+ LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_REQPORTAL);
+ LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_ACL);
+ LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_XATTR);
+ LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_CROW);
+ LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_TRUNCLOCK);
+ LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_TRANSNO);
+ LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_IBITS);
+ LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_JOIN);
+ LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_ATTRFID);
+ LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_NODEVOH);
+ LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_RMT_CLIENT);
+ LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_RMT_CLIENT_FORCE);
+ LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_BRW_SIZE);
+ LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_QUOTA64);
+ LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_MDS_CAPA);
+ LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_OSS_CAPA);
+ LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_CANCELSET);
+ LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_SOM);
+ LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_AT);
+ LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_LRU_RESIZE);
+ LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_MDS_MDS);
+ LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_REAL);
+ LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_CHANGE_QS);
+ LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_CKSUM);
+ LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_FID);
+ LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_VBR);
+ LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_LOV_V3);
+ LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_GRANT_SHRINK);
+ LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_SKIP_ORPHAN);
+ LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_MAX_EASIZE);
+ LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_FULL20);
+ LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_LAYOUTLOCK);
+ LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_64BITHASH);
+ LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_MAXBYTES);
+ LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_IMP_RECOV);
+ LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_JOBSTATS);
+ LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_UMASK);
+ LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_EINPROGRESS);
+ LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_GRANT_PARAM);
+ LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_FLOCK_OWNER);
+ LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_LVB_TYPE);
+ LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_NANOSEC_TIME);
+ LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_LIGHTWEIGHT);
+ LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_SHORTIO);
+ LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
+ OBD_CONNECT_PINGLESS);
+ LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL,
+ "found 0x%.16llxULL\n", OBD_CONNECT_FLOCK_DEAD);
+ LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)OBD_CKSUM_CRC32);
+ LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)OBD_CKSUM_ADLER);
+ LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)OBD_CKSUM_CRC32C);
+
+ /* Checks for struct obdo */
+ LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n",
+ (long long)(int)sizeof(struct obdo));
+ LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_valid));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_valid));
+ LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_oi));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_oi));
+ LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_parent_seq));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq));
+ LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_size));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_size));
+ LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_mtime));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
+ LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_atime));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+ LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_ctime));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
+ LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_blocks));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_blocks));
+ LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_grant));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_grant));
+ LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_blksize));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_blksize));
+ LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_mode));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_mode));
+ LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_uid));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_uid));
+ LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_gid));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_gid));
+ LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_flags));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_flags));
+ LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_nlink));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_nlink));
+ LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_parent_oid));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid));
+ LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_misc));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_misc));
+ LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_ioepoch));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch));
+ LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_stripe_idx));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+ LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_parent_ver));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver));
+ LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_handle));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_handle));
+ LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_lcookie));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_lcookie));
+ LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_uid_h));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_uid_h));
+ LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_gid_h));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_gid_h));
+ LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_data_version));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_data_version));
+ LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_padding_4));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_padding_4));
+ LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_padding_5));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_padding_5));
+ LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_padding_6));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_padding_6));
+ LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLID);
+ LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLATIME);
+ LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLMTIME);
+ LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLCTIME);
+ LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLSIZE);
+ LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLBLOCKS);
+ LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLBLKSZ);
+ LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLMODE);
+ LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLTYPE);
+ LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLUID);
+ LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLGID);
+ LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLFLAGS);
+ LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLNLINK);
+ LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLGENER);
+ LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLRDEV);
+ LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLEASIZE);
+ LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_LINKNAME);
+ LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLHANDLE);
+ LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLCKSUM);
+ LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLQOS);
+ LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLCOOKIE);
+ LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLGROUP);
+ LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLFID);
+ LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLEPOCH);
+ LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLGRANT);
+ LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLDIREA);
+ LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLUSRQUOTA);
+ LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLGRPQUOTA);
+ LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLMODEASIZE);
+ LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_MDS);
+ LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_REINT);
+ LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_MEA);
+ LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL),
+ "found 0x%.16llxULL\n", OBD_MD_TSTATE);
+ LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLXATTR);
+ LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLXATTRLS);
+ LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLXATTRRM);
+ LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLACL);
+ LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLRMTPERM);
+ LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLMDSCAPA);
+ LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLOSSCAPA);
+ LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLCKSPLIT);
+ LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLCROSSREF);
+ LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLGETATTRLOCK);
+ LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLRMTLSETFACL);
+ LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLRMTLGETFACL);
+ LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLRMTRSETFACL);
+ LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLRMTRGETFACL);
+ LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n",
+ OBD_MD_FLDATAVERSION);
+ CLASSERT(OBD_FL_INLINEDATA == 0x00000001);
+ CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002);
+ CLASSERT(OBD_FL_DELORPHAN == 0x00000004);
+ CLASSERT(OBD_FL_NORPC == 0x00000008);
+ CLASSERT(OBD_FL_IDONLY == 0x00000010);
+ CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020);
+ CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040);
+ CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100);
+ CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200);
+ CLASSERT(OBD_FL_CREATE_CROW == 0x00000400);
+ CLASSERT(OBD_FL_SRVLOCK == 0x00000800);
+ CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000);
+ CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000);
+ CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000);
+ CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000);
+ CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000);
+ CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000);
+ CLASSERT(OBD_FL_MMAP == 0x00040000);
+ CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000);
+ CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000);
+ CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000);
+
+ /* Checks for struct lov_ost_data_v1 */
+ LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct lov_ost_data_v1));
+ LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi));
+ LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi));
+ LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+ LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+ LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+ LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+
+ /* Checks for struct lov_mds_md_v1 */
+ LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct lov_mds_md_v1));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen));
+ LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0]));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]));
+ CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
+
+ /* Checks for struct lov_mds_md_v3 */
+ LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n",
+ (long long)(int)sizeof(struct lov_mds_md_v3));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen));
+ CLASSERT(LOV_MAXPOOLNAME == 16);
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]));
+ LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0]));
+ LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]));
+ CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+ LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)LOV_PATTERN_RAID0);
+ LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)LOV_PATTERN_RAID1);
+ LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n",
+ (unsigned)LOV_PATTERN_FIRST);
+ LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n",
+ (unsigned)LOV_PATTERN_CMOBD);
+
+ /* Checks for struct obd_statfs */
+ LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n",
+ (long long)(int)sizeof(struct obd_statfs));
+ LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_type));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_type));
+ LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_blocks));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks));
+ LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_bfree));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree));
+ LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_bavail));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail));
+ LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_ffree));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree));
+ LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_fsid));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid));
+ LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_bsize));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize));
+ LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_namelen));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
+ LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_state));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+ LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_fprecreated));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare2));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare3));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare4));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare5));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare6));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare7));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare8));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+ LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_spare9));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
+
+ /* Checks for struct obd_ioobj */
+ LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct obd_ioobj));
+ LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obd_ioobj, ioo_oid));
+ LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid));
+ LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw));
+ LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw));
+ LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt));
+ LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt));
+
+ /* Checks for union lquota_id */
+ LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n",
+ (long long)(int)sizeof(union lquota_id));
+
+ LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n",
+ (long long)QUOTABLOCK_BITS);
+ LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n",
+ (long long)QUOTABLOCK_SIZE);
+
+ /* Checks for struct obd_quotactl */
+ LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n",
+ (long long)(int)sizeof(struct obd_quotactl));
+ LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obd_quotactl, qc_cmd));
+ LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd));
+ LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct obd_quotactl, qc_type));
+ LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type));
+ LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct obd_quotactl, qc_id));
+ LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id));
+ LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct obd_quotactl, qc_stat));
+ LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat));
+ LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo));
+ LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo));
+ LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct obd_quotactl, qc_dqblk));
+ LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk));
+
+ /* Checks for struct obd_dqinfo */
+ LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct obd_dqinfo));
+ LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace));
+ LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace));
+ LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace));
+ LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace));
+ LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqinfo, dqi_flags));
+ LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags));
+ LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqinfo, dqi_valid));
+ LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid));
+
+ /* Checks for struct obd_dqblk */
+ LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n",
+ (long long)(int)sizeof(struct obd_dqblk));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_curspace));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_btime));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_itime));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+ LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n",
+ (long long)(int)offsetof(struct obd_dqblk, dqb_padding));
+ LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding));
+ LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n",
+ Q_QUOTACHECK);
+ LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n",
+ Q_INITQUOTA);
+ LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n",
+ Q_GETOINFO);
+ LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n",
+ Q_GETOQUOTA);
+ LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n",
+ Q_FINVALIDATE);
+
+ /* Checks for struct lquota_acct_rec */
+ LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct lquota_acct_rec));
+ LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_acct_rec, bspace));
+ LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace));
+ LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_acct_rec, ispace));
+ LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace));
+
+ /* Checks for struct lquota_glb_rec */
+ LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct lquota_glb_rec));
+ LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit));
+ LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit));
+ LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit));
+ LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit));
+ LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_glb_rec, qbr_time));
+ LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time));
+ LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted));
+ LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted));
+
+ /* Checks for struct lquota_slv_rec */
+ LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct lquota_slv_rec));
+ LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted));
+ LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted));
+
+ /* Checks for struct idx_info */
+ LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n",
+ (long long)(int)sizeof(struct idx_info));
+ LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_magic));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_magic));
+ LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_flags));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_flags));
+ LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_count));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_count));
+ LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_pad0));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0));
+ LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_attrs));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs));
+ LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_fid));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_fid));
+ LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_version));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_version));
+ LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_hash_start));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start));
+ LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_hash_end));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end));
+ LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_keysize));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize));
+ LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_recsize));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize));
+ LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_pad1));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1));
+ LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_pad2));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2));
+ LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct idx_info, ii_pad3));
+ LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3));
+ CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37);
+
+ /* Checks for struct lu_idxpage */
+ LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct lu_idxpage));
+ LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lu_idxpage, lip_magic));
+ LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic));
+ LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lu_idxpage, lip_flags));
+ LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags));
+ LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n",
+ (long long)(int)offsetof(struct lu_idxpage, lip_nr));
+ LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr));
+ LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lu_idxpage, lip_pad0));
+ LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0));
+ CLASSERT(LIP_MAGIC == 0x8A6D6B6C);
+ LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n",
+ (long long)LIP_HDR_SIZE);
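+ /* The II_FL_* values below are index-transfer flag bits; judging by their
+ * names they mark indexes with no usable hash, variable-size keys,
+ * variable-size records and non-unique keys respectively. */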
+ LASSERTF(II_FL_NOHASH == 1, "found %lld\n",
+ (long long)II_FL_NOHASH);
+ LASSERTF(II_FL_VARKEY == 2, "found %lld\n",
+ (long long)II_FL_VARKEY);
+ LASSERTF(II_FL_VARREC == 4, "found %lld\n",
+ (long long)II_FL_VARREC);
+ LASSERTF(II_FL_NONUNQ == 8, "found %lld\n",
+ (long long)II_FL_NONUNQ);
+
+ /* Checks for struct niobuf_remote */
+ LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct niobuf_remote));
+ LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct niobuf_remote, offset));
+ LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct niobuf_remote *)0)->offset));
+ LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct niobuf_remote, len));
+ LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct niobuf_remote *)0)->len));
+ LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct niobuf_remote, flags));
+ LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct niobuf_remote *)0)->flags));
+ LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n",
+ OBD_BRW_READ);
+ LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n",
+ OBD_BRW_WRITE);
+ LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n",
+ OBD_BRW_SYNC);
+ LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n",
+ OBD_BRW_CHECK);
+ LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n",
+ OBD_BRW_FROM_GRANT);
+ LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n",
+ OBD_BRW_GRANTED);
+ LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n",
+ OBD_BRW_NOCACHE);
+ LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n",
+ OBD_BRW_NOQUOTA);
+ LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n",
+ OBD_BRW_SRVLOCK);
+ LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n",
+ OBD_BRW_ASYNC);
+ LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n",
+ OBD_BRW_MEMALLOC);
+
+ /* Checks for struct ost_body */
+ LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
+ (long long)(int)sizeof(struct ost_body));
+ LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ost_body, oa));
+ LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_body *)0)->oa));
+
+ /* Checks for struct ll_fid */
+ LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct ll_fid));
+ LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fid, id));
+ LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fid *)0)->id));
+ LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fid, generation));
+ LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fid *)0)->generation));
+ LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fid, f_type));
+ LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fid *)0)->f_type));
+
+ /* Checks for struct mdt_body */
+ LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_body));
+ LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, fid1));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->fid1));
+ LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, fid2));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->fid2));
+ LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, handle));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->handle));
+ LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, valid));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->valid));
+ LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, size));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->size));
+ LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, mtime));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->mtime));
+ LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, atime));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->atime));
+ LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, ctime));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->ctime));
+ LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, blocks));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->blocks));
+ LASSERTF((int)offsetof(struct mdt_body, t_state) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, t_state));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->t_state) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->t_state));
+ LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, fsuid));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->fsuid));
+ LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, fsgid));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->fsgid));
+ LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, capability));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->capability));
+ LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, mode));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->mode));
+ LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, uid));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->uid));
+ LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, gid));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->gid));
+ LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, flags));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->flags));
+ LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, rdev));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->rdev));
+ LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, nlink));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->nlink));
+ LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, unused2));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->unused2));
+ LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, suppgid));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->suppgid));
+ LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, eadatasize));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize));
+ LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, aclsize));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->aclsize));
+ LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, max_mdsize));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize));
+ LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, max_cookiesize));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize));
+ LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, uid_h));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->uid_h));
+ LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, gid_h));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->gid_h));
+ LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, padding_5));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->padding_5));
+ LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, padding_6));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->padding_6));
+ LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, padding_7));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->padding_7));
+ LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, padding_8));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->padding_8));
+ LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, padding_9));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->padding_9));
+ LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_body, padding_10));
+ LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_body *)0)->padding_10));
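+ /* The MDS_FMODE_* and MDS_OPEN_* constants below are octal, apparently so
+ * that the low bits line up with the corresponding O_* open(2) flags;
+ * values wider than 32 bits (MDS_OPEN_NORESTORE and up) are compared as
+ * unsigned long long and printed with the 0%.22llo format. */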
+ LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n",
+ MDS_FMODE_CLOSED);
+ LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n",
+ MDS_FMODE_EXEC);
+ LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n",
+ MDS_FMODE_EPOCH);
+ LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n",
+ MDS_FMODE_TRUNC);
+ LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n",
+ MDS_FMODE_SOM);
+ LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n",
+ MDS_OPEN_CREATED);
+ LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n",
+ MDS_OPEN_CROSS);
+ LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n",
+ MDS_OPEN_CREAT);
+ LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n",
+ MDS_OPEN_EXCL);
+ LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n",
+ MDS_OPEN_TRUNC);
+ LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n",
+ MDS_OPEN_APPEND);
+ LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n",
+ MDS_OPEN_SYNC);
+ LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n",
+ MDS_OPEN_DIRECTORY);
+ LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_BY_FID);
+ LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_DELAY_CREATE);
+ LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_OWNEROVERRIDE);
+ LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_JOIN_FILE);
+ LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_LOCK);
+ LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_HAS_EA);
+ LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n",
+ MDS_OPEN_HAS_OBJS);
+ LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n",
+ (long long)MDS_OPEN_NORESTORE);
+ LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n",
+ (long long)MDS_OPEN_NEWSTRIPE);
+ LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n",
+ (long long)MDS_OPEN_VOLATILE);
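+ /* The LUSTRE_*_FL values below appear to mirror the matching FS_*_FL
+ * (ext2-style) inode flag values, which is presumably why the exact
+ * bit positions are asserted here. */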
+ LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n",
+ LUSTRE_SYNC_FL);
+ LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n",
+ LUSTRE_IMMUTABLE_FL);
+ LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n",
+ LUSTRE_APPEND_FL);
+ LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n",
+ LUSTRE_NOATIME_FL);
+ LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n",
+ LUSTRE_DIRSYNC_FL);
+ LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n",
+ MDS_INODELOCK_LOOKUP);
+ LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n",
+ MDS_INODELOCK_UPDATE);
+ LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n",
+ MDS_INODELOCK_OPEN);
+ LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n",
+ MDS_INODELOCK_LAYOUT);
+
+ /* Checks for struct mdt_ioepoch */
+ LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_ioepoch));
+ LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_ioepoch, handle));
+ LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle));
+ LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_ioepoch, ioepoch));
+ LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch));
+ LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_ioepoch, flags));
+ LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags));
+ LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_ioepoch, padding));
+ LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding));
+
+ /* Checks for struct mdt_remote_perm */
+ LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_remote_perm));
+ LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_remote_perm, rp_uid));
+ LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid));
+ LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_remote_perm, rp_gid));
+ LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid));
+ LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid));
+ LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid));
+ LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm));
+ LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm));
+ LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_remote_perm, rp_padding));
+ LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding));
+ LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)CFS_SETUID_PERM);
+ LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)CFS_SETGID_PERM);
+ LASSERTF(CFS_SETGRP_PERM == 0x00000004UL, "found 0x%.8xUL\n",
+ (unsigned)CFS_SETGRP_PERM);
+ LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n",
+ (unsigned)CFS_RMTACL_PERM);
+ LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n",
+ (unsigned)CFS_RMTOWN_PERM);
+
+ /* Checks for struct mdt_rec_setattr */
+ LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_setattr));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_size));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4));
+ LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5));
+ LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5));
+
+ /* Checks for struct mdt_rec_create */
+ LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_create));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_fid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_fid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_time));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_rdev));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_mode));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_umask));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask));
+ LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4));
+ LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4));
+
+ /* Checks for struct mdt_rec_link */
+ LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_link));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_fid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_fid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_time));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8));
+ LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9));
+ LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9));
+
+ /* Checks for struct mdt_rec_unlink */
+ LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_unlink));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_time));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8));
+ LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9));
+ LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9));
+
+ /* Checks for struct mdt_rec_rename */
+ LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_rename));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_time));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_mode));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7));
+ LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8));
+ LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8));
+
+ /* Checks for struct mdt_rec_setxattr */
+ LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_setxattr));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10));
+ LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11));
+ LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+
+ /* Checks for struct mdt_rec_reint */
+ LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_reint));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
+ LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
+ LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
+
+ /* Checks for struct lmv_desc */
+ LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
+ (long long)(int)sizeof(struct lmv_desc));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_pattern));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
+ LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_desc, ld_uuid));
+ LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
+
+ /* Checks for struct lmv_stripe_md */
+ LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct lmv_stripe_md));
+ LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_stripe_md, mea_magic));
+ LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic));
+ LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_stripe_md, mea_count));
+ LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count));
+ LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_stripe_md, mea_master));
+ LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master));
+ LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_stripe_md, mea_padding));
+ LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding));
+ CLASSERT(LOV_MAXPOOLNAME == 16);
+ LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16]));
+ LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]));
+ LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0]));
+ LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]));
+
+ /* Checks for struct lov_desc */
+ LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
+ (long long)(int)sizeof(struct lov_desc));
+ LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
+ LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
+ LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
+ LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_pattern));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
+ LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
+ LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+ LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_padding_0));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
+ LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+ LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+ LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+ LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_uuid));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
+ CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C);
+
+ /* Checks for struct ldlm_res_id */
+ LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_res_id));
+ CLASSERT(RES_NAME_SIZE == 4);
+ LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_res_id, name[4]));
+ LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
+
+ /* Checks for struct ldlm_extent */
+ LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_extent));
+ LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_extent, start));
+ LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
+ LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_extent, end));
+ LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
+ LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_extent, gid));
+ LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
+
+ /* Checks for struct ldlm_inodebits */
+ LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_inodebits));
+ LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_inodebits, bits));
+ LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
+ /* Checks for struct ldlm_flock_wire */
+ LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_flock_wire));
+ LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
+ LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
+ LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
+ LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
+ LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
+ LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
+ LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
+ LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
+ LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
+ LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
+
+ /* Checks for struct ldlm_intent */
+ LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_intent));
+ LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_intent, opc));
+ LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
+
+ /* Checks for struct ldlm_resource_desc */
+ LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_resource_desc));
+ LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
+ LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+ LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+ LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
+ LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
+ LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
+
+ /* Checks for struct ldlm_lock_desc */
+ LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_lock_desc));
+ LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
+ LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
+ LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
+ LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
+ LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
+ LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
+ LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
+ LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
+
+ /* Checks for struct ldlm_request */
+ LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_request));
+ LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_request, lock_flags));
+ LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+ LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_request, lock_count));
+ LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
+ LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_request, lock_desc));
+ LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
+ LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_request, lock_handle));
+ LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
+
+ /* Checks for struct ldlm_reply */
+ LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_reply));
+ LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_reply, lock_flags));
+ LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+ LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_reply, lock_padding));
+ LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
+ LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_reply, lock_desc));
+ LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
+ LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_reply, lock_handle));
+ LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
+ LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
+ LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
+ LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
+ LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
+
+ /* Checks for struct ost_lvb_v1 */
+ LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct ost_lvb_v1));
+ LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
+ LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
+ LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
+ LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
+ LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
+ LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
+ LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
+ LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
+ LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
+ LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
+
+ /* Checks for struct ost_lvb */
+ LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
+ (long long)(int)sizeof(struct ost_lvb));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_size));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_atime));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
+ LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
+ (long long)(int)offsetof(struct ost_lvb, lvb_padding));
+ LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
+
+ /* Checks for struct lquota_lvb */
+ LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct lquota_lvb));
+ LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
+ LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
+ LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
+ LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
+ LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
+ LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
+ LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
+ LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
+ LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
+ LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
+ LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
+ (long long)LQUOTA_FL_EDQUOT);
+
+ /* Checks for struct ldlm_gl_lquota_desc */
+ LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
+ LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
+ LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
+
+ /* Checks for struct mgs_send_param */
+ LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
+ (long long)(int)sizeof(struct mgs_send_param));
+ CLASSERT(MGS_PARAM_MAXLEN == 1024);
+ LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
+ LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
+
+ /* Checks for struct cfg_marker */
+ LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
+ (long long)(int)sizeof(struct cfg_marker));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_step));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_flags));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_vers));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_padding));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+ LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct cfg_marker, cm_comment));
+ LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
+ (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
+ /* Checks for struct llog_logid */
+ LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
+ (long long)(int)sizeof(struct llog_logid));
+ LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid, lgl_oi));
+ LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
+ LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid, lgl_ogen));
+ LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
+ CLASSERT(OST_SZ_REC == 274730752);
+ CLASSERT(MDS_UNLINK_REC == 274801668);
+ CLASSERT(MDS_UNLINK64_REC == 275325956);
+ CLASSERT(MDS_SETATTR64_REC == 275325953);
+ CLASSERT(OBD_CFG_REC == 274857984);
+ CLASSERT(LLOG_GEN_REC == 274989056);
+ CLASSERT(CHANGELOG_REC == 275120128);
+ CLASSERT(CHANGELOG_USER_REC == 275185664);
+ CLASSERT(LLOG_HDR_MAGIC == 275010873);
+ CLASSERT(LLOG_LOGID_MAGIC == 275010875);
+
+ /* Checks for struct llog_catid */
+ LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct llog_catid));
+ LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_catid, lci_logid));
+ LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+ LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct llog_catid, lci_padding1));
+ LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+ LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct llog_catid, lci_padding2));
+ LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+ LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct llog_catid, lci_padding3));
+ LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
+
+ /* Checks for struct llog_rec_hdr */
+ LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct llog_rec_hdr));
+ LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
+ LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
+ LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
+ LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
+ LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
+ LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+ LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
+ LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
+
+ /* Checks for struct llog_rec_tail */
+ LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct llog_rec_tail));
+ LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
+ LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
+ LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
+ LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
+
+ /* Checks for struct llog_logid_rec */
+ LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct llog_logid_rec));
+ LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
+ LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
+ LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid_rec, lid_id));
+ LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+ LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
+ LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
+ LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
+ LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
+ LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
+ LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
+ LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
+ LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
+
+ /* Checks for struct llog_unlink_rec */
+ LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct llog_unlink_rec));
+ LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
+ LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
+ LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
+ LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
+ LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
+ LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
+ LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+ LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
+ LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
+ LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
+
+ /* Checks for struct llog_unlink64_rec */
+ LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct llog_unlink64_rec));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
+ LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
+ LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
+
+ /* Checks for struct llog_setattr64_rec */
+ LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct llog_setattr64_rec));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding));
+ LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+ LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
+ /* Checks for struct llog_size_change_rec */
+ LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct llog_size_change_rec));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
+ LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
+ LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
+
+ /* Checks for struct changelog_rec */
+ LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct changelog_rec));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_namelen));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_flags));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_type));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_index));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_prev));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_time));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_tfid));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
+ LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_rec, cr_pfid));
+ LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
+
+ /* Checks for struct changelog_ext_rec */
+ LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n",
+ (long long)(int)sizeof(struct changelog_ext_rec));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_flags));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_type));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_index));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_prev));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_time));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid));
+ LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid));
+ LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid));
+
+ /* Checks for struct changelog_setinfo */
+ LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n",
+ (long long)(int)sizeof(struct changelog_setinfo));
+ LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_setinfo, cs_recno));
+ LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno));
+ LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct changelog_setinfo, cs_id));
+ LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id));
+
+ /* Checks for struct llog_changelog_rec */
+ LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n",
+ (long long)(int)sizeof(struct llog_changelog_rec));
+ LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr));
+ LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr));
+ LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_rec, cr));
+ LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr));
+ LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_rec, cr_tail));
+ LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail));
+
+ /* Checks for struct llog_changelog_user_rec */
+ LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct llog_changelog_user_rec));
+ LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr));
+ LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr));
+ LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id));
+ LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id));
+ LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding));
+ LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding));
+ LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec));
+ LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec));
+ LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail));
+ LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail));
+
+ /* Checks for struct llog_gen */
+ LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n",
+ (long long)(int)sizeof(struct llog_gen));
+ LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_gen, mnt_cnt));
+ LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt));
+ LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct llog_gen, conn_cnt));
+ LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt));
+
+ /* Checks for struct llog_gen_rec */
+ LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct llog_gen_rec));
+ LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr));
+ LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr));
+ LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_gen_rec, lgr_gen));
+ LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen));
+ LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct llog_gen_rec, lgr_tail));
+ LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail));
+
+ /* Checks for struct llog_log_hdr */
+ LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n",
+ (long long)(int)sizeof(struct llog_log_hdr));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_hdr));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_count));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_size));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_flags));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_reserved));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap));
+ LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n",
+ (long long)(int)offsetof(struct llog_log_hdr, llh_tail));
+ LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail));
+
+ /* Checks for struct llog_cookie */
+ LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct llog_cookie));
+ LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llog_cookie, lgc_lgl));
+ LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl));
+ LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct llog_cookie, lgc_subsys));
+ LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys));
+ LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct llog_cookie, lgc_index));
+ LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+ LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+ LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
+
+ /* Checks for struct llogd_body */
+ LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n",
+ (long long)(int)sizeof(struct llogd_body));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_logid));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_llh_flags));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_index));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_saved_index));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_len));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len));
+ LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
+ LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
+ CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+ CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+ CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+ CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+ CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+ CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+ CLASSERT(LLOG_CATINFO == 507);
+ CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+ CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
+ CLASSERT(LLOG_FIRST_OPC == 501);
+ CLASSERT(LLOG_LAST_OPC == 510);
+
+ /* Checks for struct llogd_conn_body */
+ LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n",
+ (long long)(int)sizeof(struct llogd_conn_body));
+ LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen));
+ LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen));
+ LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid));
+ LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid));
+ LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx));
+ LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
+
+ /* Checks for struct ll_fiemap_info_key */
+ LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n",
+ (long long)(int)sizeof(struct ll_fiemap_info_key));
+ LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_info_key, name[8]));
+ LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]));
+ LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_info_key, oa));
+ LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa));
+ LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap));
+ LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap));
+
+ /* Checks for struct quota_body */
+ LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n",
+ (long long)(int)sizeof(struct quota_body));
+ LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_fid));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_fid));
+ LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_id));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_id));
+ LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_flags));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_flags));
+ LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_padding));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_padding));
+ LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_count));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_count));
+ LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_usage));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_usage));
+ LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_slv_ver));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver));
+ LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_lockh));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh));
+ LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_glb_lockh));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh));
+ LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct quota_body, qb_padding1[4]));
+ LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4]));
+
+ /* Checks for struct mgs_target_info */
+ LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n",
+ (long long)(int)sizeof(struct mgs_target_info));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_config_ver));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_flags));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_nid_count));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_instance));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_fsname));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_svname));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_uuid));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_nids));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids));
+ LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n",
+ (long long)(int)offsetof(struct mgs_target_info, mti_params));
+ LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n",
+ (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params));
+
+ /* Checks for struct lustre_capa */
+ LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n",
+ (long long)(int)sizeof(struct lustre_capa));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_fid));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_opc));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_uid));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_gid));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_flags));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_keyid));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_timeout));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout));
+ LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_expiry));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry));
+ CLASSERT(CAPA_HMAC_MAX_LEN == 64);
+ LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa, lc_hmac[64]));
+ LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]));
+
+ /* Checks for struct lustre_capa_key */
+ LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n",
+ (long long)(int)sizeof(struct lustre_capa_key));
+ LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa_key, lk_seq));
+ LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq));
+ LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa_key, lk_keyid));
+ LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid));
+ LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa_key, lk_padding));
+ LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding));
+ CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56);
+ LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct lustre_capa_key, lk_key[56]));
+ LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]));
+
+ /* Checks for struct getinfo_fid2path */
+ LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct getinfo_fid2path));
+ LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct getinfo_fid2path, gf_fid));
+ LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid));
+ LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct getinfo_fid2path, gf_recno));
+ LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno));
+ LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno));
+ LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno));
+ LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen));
+ LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen));
+ LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0]));
+ LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n",
+ (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]));
+
+ /* Checks for struct ll_user_fiemap */
+ LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct ll_user_fiemap));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+ LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+ LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
+ CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+ CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+ CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+ /* Checks for struct ll_fiemap_extent */
+ LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n",
+ (long long)(int)sizeof(struct ll_fiemap_extent));
+ LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+ LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+ LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+ LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+ LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+ LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
+ LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+ LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+ LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+ LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
+ CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+ CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+ CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+ CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
+ CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+ CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+ CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+ CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+ CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+ CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+ CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+ CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
+
+ /* Checks for type posix_acl_xattr_entry */
+ LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n",
+ (long long)(int)sizeof(posix_acl_xattr_entry));
+ LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n",
+ (long long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+ LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+ (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+ LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+ (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+ LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+ (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+ LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+ (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+ LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+ /* Checks for type posix_acl_xattr_header */
+ LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+ (long long)(int)sizeof(posix_acl_xattr_header));
+ LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+ (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+ LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+ LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+ (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+ LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+ (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+ /* Checks for struct link_ea_header */
+ LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct link_ea_header));
+ LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_header, leh_magic));
+ LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+ LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+ LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+ LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_header, leh_len));
+ LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+ LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_header, padding1));
+ LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+ LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_header, padding2));
+ LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+ CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
+ /* Checks for struct link_ea_entry */
+ LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+ (long long)(int)sizeof(struct link_ea_entry));
+ LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+ LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+ LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+ LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+ LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+ (long long)(int)offsetof(struct link_ea_entry, lee_name));
+ LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+ /* Checks for struct layout_intent */
+ LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct layout_intent));
+ LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_opc));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+ LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_flags));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+ LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_start));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+ LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_end));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+ LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+ (long long)LAYOUT_INTENT_ACCESS);
+ LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+ (long long)LAYOUT_INTENT_READ);
+ LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+ (long long)LAYOUT_INTENT_WRITE);
+ LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+ (long long)LAYOUT_INTENT_GLIMPSE);
+ LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+ (long long)LAYOUT_INTENT_TRUNC);
+ LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+ (long long)LAYOUT_INTENT_RELEASE);
+ LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+ (long long)LAYOUT_INTENT_RESTORE);
+
+ /* Checks for struct hsm_action_item */
+ LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_action_item));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_len));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_action));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+ LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_item, hai_data));
+ LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+ /* Checks for struct hsm_action_list */
+ LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_action_list));
+ LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, hal_version));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+ LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, hal_count));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+ LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+ LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+ LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+ LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, padding1));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+ LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+ LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+ /* Checks for struct hsm_progress */
+ LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_progress));
+ LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress, hp_fid));
+ LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+ LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+ LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+ LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress, hp_extent));
+ LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+ LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress, hp_flags));
+ LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+ LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress, hp_errval));
+ LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+ LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress, padding));
+ LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+ LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+ HP_FLAG_COMPLETED);
+ LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+ HP_FLAG_RETRY);
+
+ /* Checks for struct hsm_copy */
+ LASSERTF((int)sizeof(struct hsm_copy) == 88, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_copy));
+ LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+ LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+ LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_copy, hc_flags));
+ LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+ LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_copy, hc_errval));
+ LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+ LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_copy, padding));
+ LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+ LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_copy, hc_hai));
+ LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+ /* Checks for struct hsm_progress_kernel */
+ LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_progress_kernel));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+ LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+ LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+ /* Checks for struct hsm_user_item */
+ LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_user_item));
+ LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+ LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+ LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+ LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+ /* Checks for struct hsm_user_state */
+ LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_user_state));
+ LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_state, hus_states));
+ LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+ LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+ LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+ LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+ LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+ LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+ LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+ LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+ LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+ /* Checks for struct hsm_state_set */
+ LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_state_set));
+ LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+ LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+ LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+ LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+ LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+ LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+ LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+ LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+ /* Checks for struct hsm_current_action */
+ LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_current_action));
+ LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_current_action, hca_state));
+ LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+ LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_current_action, hca_action));
+ LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+ LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_current_action, hca_location));
+ LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+ /* Checks for struct hsm_request */
+ LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_request));
+ LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_request, hr_action));
+ LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+ LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+ LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+ LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_request, hr_flags));
+ LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+ LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+ LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+ LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_request, hr_data_len));
+ LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+ LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+ (unsigned)HSM_FORCE_ACTION);
+ LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+ (unsigned)HSM_GHOST_COPY);
+
+ /* Checks for struct hsm_user_request */
+ LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+ (long long)(int)sizeof(struct hsm_user_request));
+ LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_request, hur_request));
+ LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+ LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+ LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+ /* Checks for struct hsm_user_import */
+ LASSERTF(sizeof(struct hsm_user_import) == 48, "found %lld\n",
+ (long long)sizeof(struct hsm_user_import));
+ LASSERTF(offsetof(struct hsm_user_import, hui_size) == 0,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_size));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_size) == 8,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_size));
+ LASSERTF(offsetof(struct hsm_user_import, hui_uid) == 32,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_uid));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_uid) == 4,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_uid));
+ LASSERTF(offsetof(struct hsm_user_import, hui_gid) == 36,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_gid));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_gid) == 4,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_gid));
+ LASSERTF(offsetof(struct hsm_user_import, hui_mode) == 40,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_mode));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mode) == 4,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_mode));
+ LASSERTF(offsetof(struct hsm_user_import, hui_atime) == 8,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_atime));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime) == 8,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_atime));
+ LASSERTF(offsetof(struct hsm_user_import, hui_atime_ns) == 24,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_atime_ns));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_atime_ns));
+ LASSERTF(offsetof(struct hsm_user_import, hui_mtime) == 16,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_mtime));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime));
+ LASSERTF(offsetof(struct hsm_user_import, hui_mtime_ns) == 28,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_mtime_ns));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns));
+ LASSERTF(offsetof(struct hsm_user_import, hui_archive_id) == 44,
+ "found %lld\n",
+ (long long)offsetof(struct hsm_user_import, hui_archive_id));
+ LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4,
+ "found %lld\n",
+ (long long)sizeof(((struct hsm_user_import *)0)->hui_archive_id));
+
+ /* Checks for struct update_buf */
+ LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct update_buf));
+ LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct update_buf, ub_magic));
+ LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+ LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct update_buf, ub_count));
+ LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+ LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct update_buf, ub_bufs));
+ LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+ /* Checks for struct update_reply */
+ LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+ (long long)(int)sizeof(struct update_reply));
+ LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct update_reply, ur_version));
+ LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+ LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct update_reply, ur_count));
+ LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+ LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct update_reply, ur_lens));
+ LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+ /* Checks for struct update */
+ LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+ (long long)(int)sizeof(struct update));
+ LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct update, u_type));
+ LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct update *)0)->u_type));
+ LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct update, u_batchid));
+ LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct update *)0)->u_batchid));
+ LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct update, u_fid));
+ LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct update *)0)->u_fid));
+ LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct update, u_lens));
+ LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+ (long long)(int)sizeof(((struct update *)0)->u_lens));
+ LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct update, u_bufs));
+ LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+ (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}