vfs: Add open by file handle support [Kernel]

Prev: [PATCH 6/8] ceph: sync ceph_fs.h with the server tree version
Next: [PATCH] lib/atomic64_test: Fix missing include of linux/kernel.h

From: Neil Brown on 12 May 2010 19:50

On Wed, 12 May 2010 21:20:39 +0530
"Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote:

> Acked-by: Serge Hallyn <serue(a)us.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.vnet.ibm.com>
> ---
> fs/namei.c | 24 ---------
> fs/open.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/namei.h | 24 +++++++++
> 3 files changed, 160 insertions(+), 24 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index a7dce91..a18711e 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1521,30 +1521,6 @@ out_unlock:
> return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
> }
>
> -/*
> - * Note that while the flag value (low two bits) for sys_open means:
> - * 00 - read-only
> - * 01 - write-only
> - * 10 - read-write
> - * 11 - special
> - * it is changed into
> - * 00 - no permissions needed
> - * 01 - read-permission
> - * 10 - write-permission
> - * 11 - read-write
> - * for the internal routines (ie open_namei()/follow_link() etc)
> - * This is more logical, and also allows the 00 "no perm needed"
> - * to be used for symlinks (where the permissions are checked
> - * later).
> - *
> -*/
> -static inline int open_to_namei_flags(int flag)
> -{
> - if ((flag+1) & O_ACCMODE)
> - flag++;
> - return flag;
> -}
> -
> static int open_will_truncate(int flag, struct inode *inode)
> {
> /*
> diff --git a/fs/open.c b/fs/open.c
> index 9a34b81..348a1b9 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1315,3 +1315,139 @@ err_out:
> asmlinkage_protect(4, ret, dfd, name, handle, flag);
> return ret;
> }
> +
> +static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
> +{
> + return 1;
> +}
> +
> +static struct dentry *handle_to_dentry(struct vfsmount *mnt,
> + struct file_handle *handle)
> +{
> + int handle_size;
> + struct dentry *dentry;
> +
> + /* change the handle size to multiple of sizeof(u32) */
> + handle_size = handle->handle_size >> 2;
> + dentry = exportfs_decode_fh(mnt, (struct fid *)handle->f_handle,
> + handle_size, handle->handle_type,
> + vfs_dentry_acceptable, NULL);
> + return dentry;
> +}
> +
> +static long do_sys_open_by_handle(struct file_handle __user *ufh, int flags)
> +{
> + int fd;
> + int retval = 0;
> + int d_flags = flags;
> + struct file *filp;
> + struct vfsmount *mnt;
> + struct inode *inode;
> + struct dentry *dentry;
> + struct file_handle f_handle;
> + struct file_handle *handle = NULL;
> +
> + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
> + retval = -EFAULT;
> + goto out_err;
> + }
> + if ((f_handle.handle_size > MAX_HANDLE_SZ) ||
> + (f_handle.handle_size <= 0)) {
> + retval = -EINVAL;
> + goto out_err;
> + }
> + if (!capable(CAP_DAC_OVERRIDE)) {
> + retval = -EPERM;
> + goto out_err;
> + }
> + /*
> + * Find the vfsmount for this uuid in the
> + * current namespace
> + */
> + mnt = fs_get_vfsmount(current, &f_handle.fsid);
> + if (!mnt) {
> + retval = -ESTALE;
> + goto out_err;
> + }
> +
> + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size,
> + GFP_KERNEL);
> + if (!handle) {
> + retval = -ENOMEM;
> + goto out_mnt;
> + }
> + /* copy the full handle */
> + if (copy_from_user(handle, ufh,
> + sizeof(struct file_handle) +
> + f_handle.handle_size)) {
> + retval = -EFAULT;
> + goto out_mnt;
> + }
> + dentry = handle_to_dentry(mnt, handle);
> + if (IS_ERR(dentry)) {
> + retval = PTR_ERR(dentry);
> + goto out_mnt;
> + }
> + inode = dentry->d_inode;
> + flags = open_to_namei_flags(flags);
> + /* O_TRUNC implies we need access checks for write permissions */
> + if (flags & O_TRUNC)
> + flags |= MAY_WRITE;
> +
> + if ((!(flags & O_APPEND) || (flags & O_TRUNC)) &&
> + (flags & FMODE_WRITE) && IS_APPEND(inode)) {
> + retval = -EPERM;
> + goto out_dentry;
> + }
> + if ((flags & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
> + retval = -EACCES;
> + goto out_dentry;
> + }
> + /* Can't write directories. */
> + if (S_ISDIR(inode->i_mode) && (flags & FMODE_WRITE)) {
> + retval = -EISDIR;
> + goto out_dentry;
> + }

Including all these checks inline here seems error prone. Can you not just
use finish_open ?? It might do more than you need, but it would be more
obvious that you didn't forget anything..

> + fd = get_unused_fd_flags(d_flags);
> + if (fd < 0) {
> + retval = fd;
> + goto out_dentry;
> + }
> + filp = dentry_open(dget(dentry), mntget(mnt),
> + d_flags, current_cred());
> + if (IS_ERR(filp)) {
> + put_unused_fd(fd);
> + retval = PTR_ERR(filp);
> + goto out_dentry;
> + }
> + if (inode->i_mode & S_IFREG) {

I suspect this is not the test you want. It tests for IFREG or IFLNK or
IFSOCK.

> + filp->f_flags |= O_NOATIME;
> + filp->f_mode |= FMODE_NOCMTIME;
> + }

I think you need a comment here explaining the rational for these setting.
Why is O_NOATIME important IFREG but not for IFDIR?
Why is it not sufficient to honour O_NOATIME that is passed in.
How can you ever justify setting FMODE_NOCMTIME ?
I guess you are just copying from xfs code, but it still needs justification.

NeilBrown

> + fsnotify_open(filp->f_path.dentry);
> + fd_install(fd, filp);
> + retval = fd;
> +
> +out_dentry:
> + dput(dentry);
> +out_mnt:
> + kfree(handle);
> + mntput(mnt);
> +out_err:
> + return retval;
> +}
> +
> +SYSCALL_DEFINE2(open_by_handle, struct file_handle __user *, handle,
> + int, flags)
> +{
> + long ret;
> +
> + if (force_o_largefile())
> + flags |= O_LARGEFILE;
> +
> + ret = do_sys_open_by_handle(handle, flags);
> +
> + /* avoid REGPARM breakage on x86: */
> + asmlinkage_protect(2, ret, handle, flags);
> + return ret;
> +}
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index 05b441d..a853aa0 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -4,6 +4,7 @@
> #include <linux/dcache.h>
> #include <linux/linkage.h>
> #include <linux/path.h>
> +#include <asm-generic/fcntl.h>
>
> struct vfsmount;
>
> @@ -96,4 +97,27 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
> ((char *) name)[min(len, maxlen)] = '\0';
> }
>
> +/*
> + * Note that while the flag value (low two bits) for sys_open means:
> + * 00 - read-only
> + * 01 - write-only
> + * 10 - read-write
> + * 11 - special
> + * it is changed into
> + * 00 - no permissions needed
> + * 01 - read-permission
> + * 10 - write-permission
> + * 11 - read-write
> + * for the internal routines (ie open_namei()/follow_link() etc)
> + * This is more logical, and also allows the 00 "no perm needed"
> + * to be used for symlinks (where the permissions are checked
> + * later).
> + *
> +*/
> +static inline int open_to_namei_flags(int flag)
> +{
> + if ((flag+1) & O_ACCMODE)
> + flag++;
> + return flag;
> +}
> #endif /* _LINUX_NAMEI_H */

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Dave Chinner on 13 May 2010 02:20

On Thu, May 13, 2010 at 09:44:22AM +1000, Neil Brown wrote:
> On Wed, 12 May 2010 21:20:39 +0530
> "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote:
>
> > + filp->f_flags |= O_NOATIME;
> > + filp->f_mode |= FMODE_NOCMTIME;
> > + }
>
> I think you need a comment here explaining the rational for these setting.

If you've never seen how applications use the XFS handle interface
in conjunction with other XFS functionality, then I guess if would
seem like bad voodoo.

> Why is O_NOATIME important IFREG but not for IFDIR?

No application has ever required directory access or modification
via the handle interface to be invisible to the rest of the system.

> Why is it not sufficient to honour O_NOATIME that is passed in.

Because the XFS handle library is cross platform and predates
O_NOATIME on linux. Hence the library it has never set that flag and
always relied on the kernel implementation of the API to ensure
atime was never updated on fds derived from handles..

> How can you ever justify setting FMODE_NOCMTIME ?

Quite easily. ;)

The XFS handle interface was designed specifically to allow
applications to execute silent/invisible movement of data in, out
and around the filesystem without leaving user visible traces in
file metadata. This enables backup or filesysetm utilities that
operate on active filesystems need to be able to access or modify
inodes and data without affecting running applications. It's a
feature of the handle interface, and used by xfs_dump, xfs_fsr,
SGI's HSM, etc to do stuff that isn't otherwise possible.

FWIW, if you are curious, here's the initial commit of the XFS
handle code into Irix tree from 3 Sep 1994, showing that the initial
XFS open_by_handle() implementation sets the FINVIS flag to trigger
invisible IO semantics:

http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=575b66fae833429a51fcadb204d45521c2dfc26f

> I guess you are just copying from xfs code, but it still needs justification.

"They are intended for use by a limited set of system
utilities such as backup programs."

- open_by_handle(3) man page

Cheers,

Dave.
--
Dave Chinner
david(a)fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Aneesh Kumar K. V on 13 May 2010 02:40

On Thu, 13 May 2010 16:09:55 +1000, Dave Chinner <david(a)fromorbit.com> wrote:
> On Thu, May 13, 2010 at 09:44:22AM +1000, Neil Brown wrote:
> > On Wed, 12 May 2010 21:20:39 +0530
> > "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote:
> >
> > > + filp->f_flags |= O_NOATIME;
> > > + filp->f_mode |= FMODE_NOCMTIME;
> > > + }
> >
> > I think you need a comment here explaining the rational for these setting.
>
> If you've never seen how applications use the XFS handle interface
> in conjunction with other XFS functionality, then I guess if would
> seem like bad voodoo.
>
> > Why is O_NOATIME important IFREG but not for IFDIR?
>
> No application has ever required directory access or modification
> via the handle interface to be invisible to the rest of the system.
>
> > Why is it not sufficient to honour O_NOATIME that is passed in.
>
> Because the XFS handle library is cross platform and predates
> O_NOATIME on linux. Hence the library it has never set that flag and
> always relied on the kernel implementation of the API to ensure
> atime was never updated on fds derived from handles..
>
> > How can you ever justify setting FMODE_NOCMTIME ?
>
> Quite easily. ;)
>
> The XFS handle interface was designed specifically to allow
> applications to execute silent/invisible movement of data in, out
> and around the filesystem without leaving user visible traces in
> file metadata. This enables backup or filesysetm utilities that
> operate on active filesystems need to be able to access or modify
> inodes and data without affecting running applications. It's a
> feature of the handle interface, and used by xfs_dump, xfs_fsr,
> SGI's HSM, etc to do stuff that isn't otherwise possible.
>
> FWIW, if you are curious, here's the initial commit of the XFS
> handle code into Irix tree from 3 Sep 1994, showing that the initial
> XFS open_by_handle() implementation sets the FINVIS flag to trigger
> invisible IO semantics:
>
> http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=575b66fae833429a51fcadb204d45521c2dfc26f

Thanks for sharing this. I haven't looked at the details you mentioned here.

>
> > I guess you are just copying from xfs code, but it still needs justification.
>
> "They are intended for use by a limited set of system
> utilities such as backup programs."
>
> - open_by_handle(3) man page
>

Should we retain all the above behaviour in the new syscall ?. Or just
do what a normal open(2) call does ?

-aneesh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Dave Chinner on 14 May 2010 06:50

On Thu, May 13, 2010 at 12:07:02PM +0530, Aneesh Kumar K. V wrote:
> On Thu, 13 May 2010 16:09:55 +1000, Dave Chinner <david(a)fromorbit.com> wrote:
> > On Thu, May 13, 2010 at 09:44:22AM +1000, Neil Brown wrote:
> > > On Wed, 12 May 2010 21:20:39 +0530
> > > "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote:
> > >
> > > > + filp->f_flags |= O_NOATIME;
> > > > + filp->f_mode |= FMODE_NOCMTIME;
> > > > + }
> > >
> > > I think you need a comment here explaining the rational for these setting.
> >
> > If you've never seen how applications use the XFS handle interface
> > in conjunction with other XFS functionality, then I guess if would
> > seem like bad voodoo.
> >
> > > Why is O_NOATIME important IFREG but not for IFDIR?
> >
> > No application has ever required directory access or modification
> > via the handle interface to be invisible to the rest of the system.
> >
> > > Why is it not sufficient to honour O_NOATIME that is passed in.
> >
> > Because the XFS handle library is cross platform and predates
> > O_NOATIME on linux. Hence the library it has never set that flag and
> > always relied on the kernel implementation of the API to ensure
> > atime was never updated on fds derived from handles..
> >
> > > How can you ever justify setting FMODE_NOCMTIME ?
> >
> > Quite easily. ;)
> >
> > The XFS handle interface was designed specifically to allow
> > applications to execute silent/invisible movement of data in, out
> > and around the filesystem without leaving user visible traces in
> > file metadata. This enables backup or filesysetm utilities that
> > operate on active filesystems need to be able to access or modify
> > inodes and data without affecting running applications. It's a
> > feature of the handle interface, and used by xfs_dump, xfs_fsr,
> > SGI's HSM, etc to do stuff that isn't otherwise possible.
> >
> > FWIW, if you are curious, here's the initial commit of the XFS
> > handle code into Irix tree from 3 Sep 1994, showing that the initial
> > XFS open_by_handle() implementation sets the FINVIS flag to trigger
> > invisible IO semantics:
> >
> > http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=575b66fae833429a51fcadb204d45521c2dfc26f
>
> Thanks for sharing this. I haven't looked at the details you mentioned here.
>
> >
> > > I guess you are just copying from xfs code, but it still needs justification.
> >
> > "They are intended for use by a limited set of system
> > utilities such as backup programs."
> >
> > - open_by_handle(3) man page
> >
>
> Should we retain all the above behaviour in the new syscall ?. Or just
> do what a normal open(2) call does ?

I'm not sure that FMODE_NOCMTIME can be set from userspace at the
moment. In fs.h:

82 /*
83 * Don't update ctime and mtime.
84 *
85 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
86 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
87 */
88 #define FMODE_NOCMTIME ((__force fmode_t)0x800)

Perhaps we need to introduce O_NOCMTIME as the comment suggests, and
then the new handle code doesn't need to automatically set it. If
libhandle is converted, then it could set the open flags as
necessary...

Cheers,

Dave.
--
Dave Chinner
david(a)fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

From: Steve French on 14 May 2010 16:00

I think open by handle will turn out to be useful, but in discussing
various "duplicate inode number" checks that we are having to add to
cifs, it reminded me that we probably need a "generation number" or
some equivalent (birth time is probably good enough as well) to be
able to tell the case where a file is deleted and new file is created
reusing the same inode number (eventually Samba needs to return this
to posix clients if inode numbers are to be useful - and I don't know
how to tell Samba how to get birth time or generation numbers out of
stat in userspace)

--
Thanks,

Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

| Next | Last
Pages: 1 2
Prev: [PATCH 6/8] ceph: sync ceph_fs.h with the server tree version
Next: [PATCH] lib/atomic64_test: Fix missing include of linux/kernel.h