Prev: [PATCH 6/8] ceph: sync ceph_fs.h with the server tree version
Next: [PATCH] lib/atomic64_test: Fix missing include of linux/kernel.h
From: Neil Brown on 12 May 2010 19:50 On Wed, 12 May 2010 21:20:39 +0530 "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote: > Acked-by: Serge Hallyn <serue(a)us.ibm.com> > Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.vnet.ibm.com> > --- > fs/namei.c | 24 --------- > fs/open.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/namei.h | 24 +++++++++ > 3 files changed, 160 insertions(+), 24 deletions(-) > > diff --git a/fs/namei.c b/fs/namei.c > index a7dce91..a18711e 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -1521,30 +1521,6 @@ out_unlock: > return may_open(&nd->path, 0, open_flag & ~O_TRUNC); > } > > -/* > - * Note that while the flag value (low two bits) for sys_open means: > - * 00 - read-only > - * 01 - write-only > - * 10 - read-write > - * 11 - special > - * it is changed into > - * 00 - no permissions needed > - * 01 - read-permission > - * 10 - write-permission > - * 11 - read-write > - * for the internal routines (ie open_namei()/follow_link() etc) > - * This is more logical, and also allows the 00 "no perm needed" > - * to be used for symlinks (where the permissions are checked > - * later). 
> - * > -*/ > -static inline int open_to_namei_flags(int flag) > -{ > - if ((flag+1) & O_ACCMODE) > - flag++; > - return flag; > -} > - > static int open_will_truncate(int flag, struct inode *inode) > { > /* > diff --git a/fs/open.c b/fs/open.c > index 9a34b81..348a1b9 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -1315,3 +1315,139 @@ err_out: > asmlinkage_protect(4, ret, dfd, name, handle, flag); > return ret; > } > + > +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) > +{ > + return 1; > +} > + > +static struct dentry *handle_to_dentry(struct vfsmount *mnt, > + struct file_handle *handle) > +{ > + int handle_size; > + struct dentry *dentry; > + > + /* change the handle size to multiple of sizeof(u32) */ > + handle_size = handle->handle_size >> 2; > + dentry = exportfs_decode_fh(mnt, (struct fid *)handle->f_handle, > + handle_size, handle->handle_type, > + vfs_dentry_acceptable, NULL); > + return dentry; > +} > + > +static long do_sys_open_by_handle(struct file_handle __user *ufh, int flags) > +{ > + int fd; > + int retval = 0; > + int d_flags = flags; > + struct file *filp; > + struct vfsmount *mnt; > + struct inode *inode; > + struct dentry *dentry; > + struct file_handle f_handle; > + struct file_handle *handle = NULL; > + > + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { > + retval = -EFAULT; > + goto out_err; > + } > + if ((f_handle.handle_size > MAX_HANDLE_SZ) || > + (f_handle.handle_size <= 0)) { > + retval = -EINVAL; > + goto out_err; > + } > + if (!capable(CAP_DAC_OVERRIDE)) { > + retval = -EPERM; > + goto out_err; > + } > + /* > + * Find the vfsmount for this uuid in the > + * current namespace > + */ > + mnt = fs_get_vfsmount(current, &f_handle.fsid); > + if (!mnt) { > + retval = -ESTALE; > + goto out_err; > + } > + > + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size, > + GFP_KERNEL); > + if (!handle) { > + retval = -ENOMEM; > + goto out_mnt; > + } > + /* copy the full handle */ > + if 
(copy_from_user(handle, ufh, > + sizeof(struct file_handle) + > + f_handle.handle_size)) { > + retval = -EFAULT; > + goto out_mnt; > + } > + dentry = handle_to_dentry(mnt, handle); > + if (IS_ERR(dentry)) { > + retval = PTR_ERR(dentry); > + goto out_mnt; > + } > + inode = dentry->d_inode; > + flags = open_to_namei_flags(flags); > + /* O_TRUNC implies we need access checks for write permissions */ > + if (flags & O_TRUNC) > + flags |= MAY_WRITE; > + > + if ((!(flags & O_APPEND) || (flags & O_TRUNC)) && > + (flags & FMODE_WRITE) && IS_APPEND(inode)) { > + retval = -EPERM; > + goto out_dentry; > + } > + if ((flags & FMODE_WRITE) && IS_IMMUTABLE(inode)) { > + retval = -EACCES; > + goto out_dentry; > + } > + /* Can't write directories. */ > + if (S_ISDIR(inode->i_mode) && (flags & FMODE_WRITE)) { > + retval = -EISDIR; > + goto out_dentry; > + } Including all these checks inline here seems error prone. Can you not just use finish_open ?? It might do more than you need, but it would be more obvious that you didn't forget anything.. > + fd = get_unused_fd_flags(d_flags); > + if (fd < 0) { > + retval = fd; > + goto out_dentry; > + } > + filp = dentry_open(dget(dentry), mntget(mnt), > + d_flags, current_cred()); > + if (IS_ERR(filp)) { > + put_unused_fd(fd); > + retval = PTR_ERR(filp); > + goto out_dentry; > + } > + if (inode->i_mode & S_IFREG) { I suspect this is not the test you want. It tests for IFREG or IFLNK or IFSOCK. > + filp->f_flags |= O_NOATIME; > + filp->f_mode |= FMODE_NOCMTIME; > + } I think you need a comment here explaining the rational for these setting. Why is O_NOATIME important IFREG but not for IFDIR? Why is it not sufficient to honour O_NOATIME that is passed in. How can you ever justify setting FMODE_NOCMTIME ? I guess you are just copying from xfs code, but it still needs justification. 
NeilBrown > + fsnotify_open(filp->f_path.dentry); > + fd_install(fd, filp); > + retval = fd; > + > +out_dentry: > + dput(dentry); > +out_mnt: > + kfree(handle); > + mntput(mnt); > +out_err: > + return retval; > +} > + > +SYSCALL_DEFINE2(open_by_handle, struct file_handle __user *, handle, > + int, flags) > +{ > + long ret; > + > + if (force_o_largefile()) > + flags |= O_LARGEFILE; > + > + ret = do_sys_open_by_handle(handle, flags); > + > + /* avoid REGPARM breakage on x86: */ > + asmlinkage_protect(2, ret, handle, flags); > + return ret; > +} > diff --git a/include/linux/namei.h b/include/linux/namei.h > index 05b441d..a853aa0 100644 > --- a/include/linux/namei.h > +++ b/include/linux/namei.h > @@ -4,6 +4,7 @@ > #include <linux/dcache.h> > #include <linux/linkage.h> > #include <linux/path.h> > +#include <asm-generic/fcntl.h> > > struct vfsmount; > > @@ -96,4 +97,27 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) > ((char *) name)[min(len, maxlen)] = '\0'; > } > > +/* > + * Note that while the flag value (low two bits) for sys_open means: > + * 00 - read-only > + * 01 - write-only > + * 10 - read-write > + * 11 - special > + * it is changed into > + * 00 - no permissions needed > + * 01 - read-permission > + * 10 - write-permission > + * 11 - read-write > + * for the internal routines (ie open_namei()/follow_link() etc) > + * This is more logical, and also allows the 00 "no perm needed" > + * to be used for symlinks (where the permissions are checked > + * later). > + * > +*/ > +static inline int open_to_namei_flags(int flag) > +{ > + if ((flag+1) & O_ACCMODE) > + flag++; > + return flag; > +} > #endif /* _LINUX_NAMEI_H */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Dave Chinner on 13 May 2010 02:20 On Thu, May 13, 2010 at 09:44:22AM +1000, Neil Brown wrote: > On Wed, 12 May 2010 21:20:39 +0530 > "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote: > > > + filp->f_flags |= O_NOATIME; > > + filp->f_mode |= FMODE_NOCMTIME; > > + } > > I think you need a comment here explaining the rational for these setting. If you've never seen how applications use the XFS handle interface in conjunction with other XFS functionality, then I guess it would seem like bad voodoo. > Why is O_NOATIME important IFREG but not for IFDIR? No application has ever required directory access or modification via the handle interface to be invisible to the rest of the system. > Why is it not sufficient to honour O_NOATIME that is passed in. Because the XFS handle library is cross-platform and predates O_NOATIME on Linux. Hence the library has never set that flag and has always relied on the kernel implementation of the API to ensure atime was never updated on fds derived from handles. > How can you ever justify setting FMODE_NOCMTIME ? Quite easily. ;) The XFS handle interface was designed specifically to allow applications to execute silent/invisible movement of data in, out and around the filesystem without leaving user-visible traces in file metadata. This enables backup or filesystem utilities that operate on active filesystems to access or modify inodes and data without affecting running applications. It's a feature of the handle interface, and is used by xfs_dump, xfs_fsr, SGI's HSM, etc. to do stuff that isn't otherwise possible. 
FWIW, if you are curious, here's the initial commit of the XFS handle code into the Irix tree from 3 Sep 1994, showing that the initial XFS open_by_handle() implementation sets the FINVIS flag to trigger invisible I/O semantics: http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=575b66fae833429a51fcadb204d45521c2dfc26f > I guess you are just copying from xfs code, but it still needs justification. "They are intended for use by a limited set of system utilities such as backup programs." - open_by_handle(3) man page Cheers, Dave. -- Dave Chinner david(a)fromorbit.com -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Aneesh Kumar K. V on 13 May 2010 02:40 On Thu, 13 May 2010 16:09:55 +1000, Dave Chinner <david(a)fromorbit.com> wrote: > On Thu, May 13, 2010 at 09:44:22AM +1000, Neil Brown wrote: > > On Wed, 12 May 2010 21:20:39 +0530 > > "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote: > > > > > + filp->f_flags |= O_NOATIME; > > > + filp->f_mode |= FMODE_NOCMTIME; > > > + } > > > > I think you need a comment here explaining the rational for these setting. > > If you've never seen how applications use the XFS handle interface > in conjunction with other XFS functionality, then I guess if would > seem like bad voodoo. > > > Why is O_NOATIME important IFREG but not for IFDIR? > > No application has ever required directory access or modification > via the handle interface to be invisible to the rest of the system. > > > Why is it not sufficient to honour O_NOATIME that is passed in. > > Because the XFS handle library is cross platform and predates > O_NOATIME on linux. Hence the library it has never set that flag and > always relied on the kernel implementation of the API to ensure > atime was never updated on fds derived from handles.. > > > How can you ever justify setting FMODE_NOCMTIME ? > > Quite easily. ;) > > The XFS handle interface was designed specifically to allow > applications to execute silent/invisible movement of data in, out > and around the filesystem without leaving user visible traces in > file metadata. This enables backup or filesysetm utilities that > operate on active filesystems need to be able to access or modify > inodes and data without affecting running applications. It's a > feature of the handle interface, and used by xfs_dump, xfs_fsr, > SGI's HSM, etc to do stuff that isn't otherwise possible. 
> > FWIW, if you are curious, here's the initial commit of the XFS > handle code into Irix tree from 3 Sep 1994, showing that the initial > XFS open_by_handle() implementation sets the FINVIS flag to trigger > invisible IO semantics: > > http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=575b66fae833429a51fcadb204d45521c2dfc26f Thanks for sharing this. I haven't looked at the details you mentioned here. > > > I guess you are just copying from xfs code, but it still needs justification. > > "They are intended for use by a limited set of system > utilities such as backup programs." > > - open_by_handle(3) man page > Should we retain all the above behaviour in the new syscall ?. Or just do what a normal open(2) call does ? -aneesh -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Dave Chinner on 14 May 2010 06:50 On Thu, May 13, 2010 at 12:07:02PM +0530, Aneesh Kumar K. V wrote: > On Thu, 13 May 2010 16:09:55 +1000, Dave Chinner <david(a)fromorbit.com> wrote: > > On Thu, May 13, 2010 at 09:44:22AM +1000, Neil Brown wrote: > > > On Wed, 12 May 2010 21:20:39 +0530 > > > "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com> wrote: > > > > > > > + filp->f_flags |= O_NOATIME; > > > > + filp->f_mode |= FMODE_NOCMTIME; > > > > + } > > > > > > I think you need a comment here explaining the rational for these setting. > > > > If you've never seen how applications use the XFS handle interface > > in conjunction with other XFS functionality, then I guess if would > > seem like bad voodoo. > > > > > Why is O_NOATIME important IFREG but not for IFDIR? > > > > No application has ever required directory access or modification > > via the handle interface to be invisible to the rest of the system. > > > > > Why is it not sufficient to honour O_NOATIME that is passed in. > > > > Because the XFS handle library is cross platform and predates > > O_NOATIME on linux. Hence the library it has never set that flag and > > always relied on the kernel implementation of the API to ensure > > atime was never updated on fds derived from handles.. > > > > > How can you ever justify setting FMODE_NOCMTIME ? > > > > Quite easily. ;) > > > > The XFS handle interface was designed specifically to allow > > applications to execute silent/invisible movement of data in, out > > and around the filesystem without leaving user visible traces in > > file metadata. This enables backup or filesysetm utilities that > > operate on active filesystems need to be able to access or modify > > inodes and data without affecting running applications. It's a > > feature of the handle interface, and used by xfs_dump, xfs_fsr, > > SGI's HSM, etc to do stuff that isn't otherwise possible. 
> > > > FWIW, if you are curious, here's the initial commit of the XFS > > handle code into Irix tree from 3 Sep 1994, showing that the initial > > XFS open_by_handle() implementation sets the FINVIS flag to trigger > > invisible IO semantics: > > > > http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=575b66fae833429a51fcadb204d45521c2dfc26f > > Thanks for sharing this. I haven't looked at the details you mentioned here. > > > > > > I guess you are just copying from xfs code, but it still needs justification. > > > > "They are intended for use by a limited set of system > > utilities such as backup programs." > > > > - open_by_handle(3) man page > > > > Should we retain all the above behaviour in the new syscall ?. Or just > do what a normal open(2) call does ? I'm not sure that FMODE_NOCMTIME can be set from userspace at the moment. In fs.h: 82 /* 83 * Don't update ctime and mtime. 84 * 85 * Currently a special hack for the XFS open_by_handle ioctl, but we'll 86 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. 87 */ 88 #define FMODE_NOCMTIME ((__force fmode_t)0x800) Perhaps we need to introduce O_NOCMTIME as the comment suggests, and then the new handle code doesn't need to automatically set it. If libhandle is converted, then it could set the open flags as necessary... Cheers, Dave. -- Dave Chinner david(a)fromorbit.com -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
From: Steve French on 14 May 2010 16:00
I think open by handle will turn out to be useful. However, discussing the various "duplicate inode number" checks that we are having to add to cifs reminded me that we probably need a "generation number" or some equivalent (birth time is probably good enough as well) to be able to detect the case where a file is deleted and a new file is created reusing the same inode number. Eventually Samba needs to return this to POSIX clients if inode numbers are to be useful, and I don't know how to tell Samba how to get birth time or generation numbers out of stat in userspace. -- Thanks, Steve -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo(a)vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ |